From b4031611e299d0038a1ae680e9503d0d461f9a91 Mon Sep 17 00:00:00 2001 From: Nico Date: Fri, 3 Apr 2026 18:18:09 +0200 Subject: [PATCH] Add --repeat=N mode to test runner with timing stats (avg/p50/p95) - run_tests.py: --repeat=N runs each test N times, aggregates into one result - Stats include: runs, pass_rate, min/avg/p50/p95/max_ms - Stats posted in result.stats field for dashboard display - Works with all suites (engine, api, matrix, roundtrip) Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/run_tests.py | 124 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 21 deletions(-) diff --git a/tests/run_tests.py b/tests/run_tests.py index 275e214..5dbdc8b 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -3,12 +3,11 @@ Test orchestrator — runs test suites and posts results to dev assay. Usage: - python tests/run_all_tests.py # all suites - python tests/run_all_tests.py api # one suite - python tests/run_all_tests.py roundtrip # one suite - python tests/run_all_tests.py api/health # single test - python tests/run_all_tests.py roundtrip/full_eras # single test - python tests/run_all_tests.py api/health roundtrip/full_chat # multiple tests + python tests/run_tests.py # all suites + python tests/run_tests.py api # one suite + python tests/run_tests.py matrix/eras_query[haiku] # single test + python tests/run_tests.py matrix --repeat=3 # each test 3x, report avg/p50/p95 + python tests/run_tests.py api/health roundtrip/full_chat # multiple tests Test names: suite/name (without the suite prefix in the test registry). 
def _compute_stats(durations: list[float], passed: int, total: int) -> dict:
    """Summarise per-run durations (milliseconds) into timing statistics.

    Args:
        durations: One elapsed time per executed run. The list is not modified.
        passed: Number of runs that completed without raising.
        total: Number of runs that were requested.

    Returns:
        A dict with the keys ``runs``, ``passed``, ``pass_rate``, ``min_ms``,
        ``avg_ms``, ``p50_ms``, ``p95_ms`` and ``max_ms`` (all rounded to
        integers), or an empty dict when ``durations`` is empty.
    """
    if not durations:
        return {}
    # Sort a copy so the caller's list keeps its run order.
    ordered = sorted(durations)
    n = len(ordered)
    # True median: mean of the two middle values when n is even.
    median = (ordered[(n - 1) // 2] + ordered[n // 2]) / 2
    # Nearest-rank percentile: rank = ceil(0.95 * n), done in integer
    # arithmetic to avoid float fuzz; always within 1..n for n >= 1.
    p95_rank = (95 * n + 99) // 100
    return {
        'runs': total,
        'passed': passed,
        'pass_rate': round(100 * passed / total) if total else 0,
        'min_ms': round(ordered[0]),
        'avg_ms': round(sum(ordered) / n),
        'p50_ms': round(median),
        'p95_ms': round(ordered[p95_rank - 1]),
        'max_ms': round(ordered[-1]),
    }


def run_test_repeated(name: str, suite: str, fn, repeat: int) -> TestResult:
    """Run a test ``repeat`` times and aggregate the runs into one result.

    A 'running' result is posted before the first run and the aggregated
    result is posted after the last one.

    Args:
        name: Test name stored in the result's ``test`` field.
        suite: Suite name stored in the result's ``suite`` field.
        fn: Zero-argument callable; a run counts as passed when it returns
            without raising.
        repeat: Number of runs; must be at least 1.

    Returns:
        The aggregated TestResult. ``stats`` holds the timing statistics,
        ``duration_ms`` the average, and ``error`` a one-line summary (plus
        the last exception message when any run failed).

    Raises:
        ValueError: If ``repeat`` is less than 1.
    """
    if repeat < 1:
        # Fail before posting anything: with zero runs there are no stats to
        # report and the summary line below could not be built.
        raise ValueError(f'repeat must be >= 1, got {repeat}')

    # Post running status
    result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
    post_result(result)

    durations = []
    passed_count = 0
    last_error = ''

    for _ in range(repeat):
        # perf_counter is monotonic, so durations are immune to wall-clock
        # adjustments during the run.
        start = time.perf_counter()
        try:
            fn()
        except Exception as exc:  # AssertionError is already an Exception
            last_error = str(exc)[:200]
        else:
            passed_count += 1
        durations.append(round((time.perf_counter() - start) * 1000))

    stats = _compute_stats(durations, passed_count, repeat)
    result.stats = stats
    result.duration_ms = stats.get('avg_ms', 0)
    # 'pass' only when every run passed; 'fail' for a partial pass;
    # 'error' when no run passed at all.
    result.status = 'pass' if passed_count == repeat else ('fail' if passed_count > 0 else 'error')
    result.error = f'{stats["pass_rate"]}% pass, avg={stats["avg_ms"]}ms p50={stats["p50_ms"]}ms p95={stats["p95_ms"]}ms'
    if last_error and passed_count < repeat:
        result.error += f' | last err: {last_error}'
    result.ts = _now_iso()
    post_result(result)
    return result


def _parse_repeat(raw: str) -> int:
    """Convert a ``--repeat`` value to a positive int or raise ValueError."""
    try:
        count = int(raw)
    except ValueError:
        raise ValueError(f'--repeat expects a positive integer, got {raw!r}') from None
    if count < 1:
        raise ValueError(f'--repeat expects a positive integer, got {raw!r}')
    return count


def parse_args(args: list[str]) -> tuple[set[str] | None, set[str], int]:
    """Parse CLI args into (suite_filter, test_filter, repeat).

    Supports: --repeat=N or --repeat N

    Args:
        args: Command-line arguments without the program name.

    Returns:
        suite_filter: set of suite names, or None for all suites
        test_filter: set of 'suite/test' names (empty = run all in suite)
        repeat: number of times to run each test (default 1)

    Raises:
        ValueError: If ``--repeat`` has no value, or its value is not a
            positive integer.
    """
    repeat = 1
    positional = []
    remaining = iter(args)
    for arg in remaining:
        if arg.startswith('--repeat='):
            repeat = _parse_repeat(arg.split('=', 1)[1])
        elif arg == '--repeat':
            # Consume the following argument as the count. A dangling
            # '--repeat' must not be mistaken for a suite name, which would
            # silently select nothing.
            value = next(remaining, None)
            if value is None:
                raise ValueError('--repeat requires a value')
            repeat = _parse_repeat(value)
        else:
            positional.append(arg)

    if not positional:
        return None, set(), repeat

    suites = set()
    tests = set()
    for arg in positional:
        if '/' in arg:
            # 'suite/test' selects one test and implies its suite.
            tests.add(arg)
            suites.add(arg.split('/')[0])
        else:
            suites.add(arg)
    return suites, tests, repeat
f'(avg={stats.get("avg_ms", 0)}ms p50={stats.get("p50_ms", 0)}ms ' + f'p95={stats.get("p95_ms", 0)}ms pass={stats.get("pass_rate", 0)}%)', flush=True) + else: + r = run_test(name, suite_name, fn) + status = 'PASS' if r.status == 'pass' else 'FAIL' + print(f' [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True) + results.append(r) - status = 'PASS' if r.status == 'pass' else 'FAIL' - print(f' [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True) - if r.error: + if r.error and repeat == 1: print(f' {r.error[:200]}', flush=True) return results def main(): - suite_filter, test_filter = parse_filters(sys.argv[1:]) + suite_filter, test_filter, repeat = parse_args(sys.argv[1:]) print(f'=== Test Run {RUN_ID} ===', flush=True) if suite_filter: print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True) + if repeat > 1: + print(f'Repeat: {repeat}x per test', flush=True) print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True) print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True) print(flush=True) @@ -181,9 +263,9 @@ def main(): for suite_name, loader in SUITES.items(): if suite_filter and suite_name not in suite_filter: continue - print(f'--- {suite_name} ---', flush=True) + print(f'--- {suite_name}{" ×" + str(repeat) if repeat > 1 else ""} ---', flush=True) tests = loader() - all_results.extend(run_suite(suite_name, tests, test_filter)) + all_results.extend(run_suite(suite_name, tests, test_filter, repeat)) print(flush=True) # Summary