Add --repeat=N mode to test runner with timing stats (avg/p50/p95)
- run_tests.py: --repeat=N runs each test N times and aggregates the runs into one result
- Stats include: runs, pass_rate, min/avg/p50/p95/max_ms
- Stats are posted in the result.stats field for dashboard display
- Works with all suites (engine, api, matrix, roundtrip)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent 4e679a3ad9
commit b4031611e2
@ -3,12 +3,11 @@
|
|||||||
Test orchestrator — runs test suites and posts results to dev assay.
|
Test orchestrator — runs test suites and posts results to dev assay.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python tests/run_all_tests.py # all suites
|
python tests/run_tests.py # all suites
|
||||||
python tests/run_all_tests.py api # one suite
|
python tests/run_tests.py api # one suite
|
||||||
python tests/run_all_tests.py roundtrip # one suite
|
python tests/run_tests.py matrix/eras_query[haiku] # single test
|
||||||
python tests/run_all_tests.py api/health # single test
|
python tests/run_tests.py matrix --repeat=3 # each test 3x, report avg/p50/p95
|
||||||
python tests/run_all_tests.py roundtrip/full_eras # single test
|
python tests/run_tests.py api/health roundtrip/full_chat # multiple tests
|
||||||
python tests/run_all_tests.py api/health roundtrip/full_chat # multiple tests
|
|
||||||
|
|
||||||
Test names: suite/name (without the suite prefix in the test registry).
|
Test names: suite/name (without the suite prefix in the test registry).
|
||||||
engine tests: graph_load, node_instantiation, edge_types_complete,
|
engine tests: graph_load, node_instantiation, edge_types_complete,
|
||||||
@ -46,6 +45,7 @@ class TestResult:
|
|||||||
duration_ms: float = 0
|
duration_ms: float = 0
|
||||||
error: str = ''
|
error: str = ''
|
||||||
ts: str = ''
|
ts: str = ''
|
||||||
|
stats: dict = field(default_factory=dict) # {runs, min_ms, avg_ms, p50_ms, p95_ms, max_ms, pass_rate}
|
||||||
|
|
||||||
|
|
||||||
def post_result(result: TestResult):
|
def post_result(result: TestResult):
|
||||||
@ -125,29 +125,100 @@ SUITES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_filters(args: list[str]) -> tuple[set[str] | None, set[str]]:
|
def _compute_stats(durations: list[float], passed: int, total: int) -> dict:
|
||||||
"""Parse CLI args into (suite_filter, test_filter).
|
"""Compute timing stats from a list of durations."""
|
||||||
|
if not durations:
|
||||||
|
return {}
|
||||||
|
durations.sort()
|
||||||
|
n = len(durations)
|
||||||
|
return {
|
||||||
|
'runs': total,
|
||||||
|
'passed': passed,
|
||||||
|
'pass_rate': round(100 * passed / total) if total else 0,
|
||||||
|
'min_ms': round(durations[0]),
|
||||||
|
'avg_ms': round(sum(durations) / n),
|
||||||
|
'p50_ms': round(durations[n // 2]),
|
||||||
|
'p95_ms': round(durations[min(int(n * 0.95), n - 1)]),
|
||||||
|
'max_ms': round(durations[-1]),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_test_repeated(name: str, suite: str, fn, repeat: int) -> TestResult:
|
||||||
|
"""Run a test N times, aggregate timing stats into one result."""
|
||||||
|
# Post running status
|
||||||
|
result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
|
||||||
|
post_result(result)
|
||||||
|
|
||||||
|
durations = []
|
||||||
|
passed_count = 0
|
||||||
|
last_error = ''
|
||||||
|
|
||||||
|
for i in range(repeat):
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
fn()
|
||||||
|
elapsed = round((time.time() - start) * 1000)
|
||||||
|
durations.append(elapsed)
|
||||||
|
passed_count += 1
|
||||||
|
except (AssertionError, Exception) as e:
|
||||||
|
elapsed = round((time.time() - start) * 1000)
|
||||||
|
durations.append(elapsed)
|
||||||
|
last_error = str(e)[:200]
|
||||||
|
|
||||||
|
stats = _compute_stats(durations, passed_count, repeat)
|
||||||
|
result.stats = stats
|
||||||
|
result.duration_ms = stats.get('avg_ms', 0)
|
||||||
|
result.status = 'pass' if passed_count == repeat else ('fail' if passed_count > 0 else 'error')
|
||||||
|
result.error = f'{stats["pass_rate"]}% pass, avg={stats["avg_ms"]}ms p50={stats["p50_ms"]}ms p95={stats["p95_ms"]}ms'
|
||||||
|
if last_error and passed_count < repeat:
|
||||||
|
result.error += f' | last err: {last_error}'
|
||||||
|
result.ts = _now_iso()
|
||||||
|
post_result(result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(args: list[str]) -> tuple[set[str] | None, set[str], int]:
|
||||||
|
"""Parse CLI args into (suite_filter, test_filter, repeat).
|
||||||
|
|
||||||
|
Supports: --repeat=N or --repeat N
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
suite_filter: set of suite names, or None for all suites
|
suite_filter: set of suite names, or None for all suites
|
||||||
test_filter: set of 'suite/test' names (empty = run all in suite)
|
test_filter: set of 'suite/test' names (empty = run all in suite)
|
||||||
|
repeat: number of times to run each test (default 1)
|
||||||
"""
|
"""
|
||||||
if not args:
|
repeat = 1
|
||||||
return None, set()
|
filtered_args = []
|
||||||
|
skip_next = False
|
||||||
|
for i, arg in enumerate(args):
|
||||||
|
if skip_next:
|
||||||
|
skip_next = False
|
||||||
|
continue
|
||||||
|
if arg.startswith('--repeat='):
|
||||||
|
repeat = int(arg.split('=', 1)[1])
|
||||||
|
elif arg == '--repeat' and i + 1 < len(args):
|
||||||
|
repeat = int(args[i + 1])
|
||||||
|
skip_next = True
|
||||||
|
else:
|
||||||
|
filtered_args.append(arg)
|
||||||
|
|
||||||
|
if not filtered_args:
|
||||||
|
return None, set(), repeat
|
||||||
|
|
||||||
suites = set()
|
suites = set()
|
||||||
tests = set()
|
tests = set()
|
||||||
for arg in args:
|
for arg in filtered_args:
|
||||||
if '/' in arg:
|
if '/' in arg:
|
||||||
tests.add(arg)
|
tests.add(arg)
|
||||||
suites.add(arg.split('/')[0])
|
suites.add(arg.split('/')[0])
|
||||||
else:
|
else:
|
||||||
suites.add(arg)
|
suites.add(arg)
|
||||||
return suites, tests
|
return suites, tests, repeat
|
||||||
|
|
||||||
|
|
||||||
def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestResult]:
|
def run_suite(suite_name: str, tests: dict, test_filter: set[str],
|
||||||
"""Run tests from a suite, optionally filtered."""
|
repeat: int = 1) -> list[TestResult]:
|
||||||
|
"""Run tests from a suite, optionally filtered and repeated."""
|
||||||
results = []
|
results = []
|
||||||
for name, fn in tests.items():
|
for name, fn in tests.items():
|
||||||
# Apply test filter if specified
|
# Apply test filter if specified
|
||||||
@ -157,21 +228,32 @@ def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestR
|
|||||||
if test_filter and full_name not in test_filter and f'{suite_name}/{short_name}' not in test_filter:
|
if test_filter and full_name not in test_filter and f'{suite_name}/{short_name}' not in test_filter:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
r = run_test(name, suite_name, fn)
|
if repeat > 1:
|
||||||
|
r = run_test_repeated(name, suite_name, fn, repeat)
|
||||||
|
status = 'PASS' if r.status == 'pass' else 'FAIL'
|
||||||
|
stats = r.stats
|
||||||
|
print(f' [{status}] {suite_name}/{name} ×{repeat} '
|
||||||
|
f'(avg={stats.get("avg_ms", 0)}ms p50={stats.get("p50_ms", 0)}ms '
|
||||||
|
f'p95={stats.get("p95_ms", 0)}ms pass={stats.get("pass_rate", 0)}%)', flush=True)
|
||||||
|
else:
|
||||||
|
r = run_test(name, suite_name, fn)
|
||||||
|
status = 'PASS' if r.status == 'pass' else 'FAIL'
|
||||||
|
print(f' [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
|
||||||
|
|
||||||
results.append(r)
|
results.append(r)
|
||||||
status = 'PASS' if r.status == 'pass' else 'FAIL'
|
if r.error and repeat == 1:
|
||||||
print(f' [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
|
|
||||||
if r.error:
|
|
||||||
print(f' {r.error[:200]}', flush=True)
|
print(f' {r.error[:200]}', flush=True)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
suite_filter, test_filter = parse_filters(sys.argv[1:])
|
suite_filter, test_filter, repeat = parse_args(sys.argv[1:])
|
||||||
|
|
||||||
print(f'=== Test Run {RUN_ID} ===', flush=True)
|
print(f'=== Test Run {RUN_ID} ===', flush=True)
|
||||||
if suite_filter:
|
if suite_filter:
|
||||||
print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
|
print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
|
||||||
|
if repeat > 1:
|
||||||
|
print(f'Repeat: {repeat}x per test', flush=True)
|
||||||
print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
|
print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
|
||||||
print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
|
print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
|
||||||
print(flush=True)
|
print(flush=True)
|
||||||
@ -181,9 +263,9 @@ def main():
|
|||||||
for suite_name, loader in SUITES.items():
|
for suite_name, loader in SUITES.items():
|
||||||
if suite_filter and suite_name not in suite_filter:
|
if suite_filter and suite_name not in suite_filter:
|
||||||
continue
|
continue
|
||||||
print(f'--- {suite_name} ---', flush=True)
|
print(f'--- {suite_name}{" ×" + str(repeat) if repeat > 1 else ""} ---', flush=True)
|
||||||
tests = loader()
|
tests = loader()
|
||||||
all_results.extend(run_suite(suite_name, tests, test_filter))
|
all_results.extend(run_suite(suite_name, tests, test_filter, repeat))
|
||||||
print(flush=True)
|
print(flush=True)
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
|
|||||||
Reference in New Issue
Block a user