This repository has been archived on 2026-04-03. You can view files and clone it, but cannot push or open issues or pull requests.
agent-runtime/tests/run_tests.py
Nico 4e679a3ad9 Add model matrix test suite: 3 tests × 3 variants = 9 combos
New 'matrix' suite runs same API tests with different LLM model configs:
- Variants: gemini-flash (baseline), haiku, gpt-4o-mini
- Tests: eras_query (SQL correctness), eras_artifact (data output), social_reflex (fast path)
- Posts results as test_name[variant] to /tests dashboard
- All 9 combos passing (6/9 verified locally, ~35s for ERAS tests)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 18:12:24 +02:00

209 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test orchestrator — runs test suites and posts results to dev assay.
Usage:
python tests/run_tests.py # all suites
python tests/run_tests.py api # one suite
python tests/run_tests.py roundtrip # one suite
python tests/run_tests.py api/health # single test
python tests/run_tests.py roundtrip/full_eras # single test
python tests/run_tests.py api/health roundtrip/full_chat # multiple tests
Test names: suite/name (without the suite prefix in the test registry).
engine tests: graph_load, node_instantiation, edge_types_complete,
condition_reflex, condition_tool_output,
frame_trace_reflex, frame_trace_expert, frame_trace_expert_with_interpreter
api tests: health, eras_umsatz_api, eras_umsatz_artifact
matrix tests: eras_query[variant], eras_artifact[variant], social_reflex[variant]
variants: gemini-flash, haiku, gpt-4o-mini
roundtrip tests: nyx_loads, inject_artifact, inject_message, full_chat, full_eras
"""
import json
import os
import sys
import time
import urllib.request
import uuid
from datetime import datetime, timezone
from dataclasses import dataclass, field, asdict
# URL that receives every result as a JSON POST; an empty value disables posting.
RESULTS_ENDPOINT = os.getenv('RESULTS_ENDPOINT', '')
# Identifier stamped on every result of this run: taken from the environment,
# otherwise the first 8 hex characters of a random UUID.
RUN_ID = os.getenv('RUN_ID', uuid.uuid4().hex[:8])
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
@dataclass
class TestResult:
    """Record describing one test outcome; converted with ``asdict`` for output."""

    run_id: str  # identifier of the run this record belongs to
    test: str  # test name
    suite: str  # suite name
    status: str  # 'pass', 'fail', 'running', 'error'
    duration_ms: float = field(default=0)  # elapsed time in milliseconds
    error: str = field(default='')  # failure / error message, empty when none
    ts: str = field(default='')  # timestamp string, empty when not set
def post_result(result: TestResult):
    """Print a test result as JSON and, if configured, POST it to the dev assay endpoint.

    The JSON line is always written to stdout. When ``RESULTS_ENDPOINT`` is
    empty nothing is sent. Posting is best-effort: any failure is reported as
    a warning on stderr and is never raised to the caller.
    """
    # Serialise once; the same text is printed and sent.
    body = json.dumps(asdict(result))
    print(body, flush=True)
    if not RESULTS_ENDPOINT:
        return
    try:
        req = urllib.request.Request(
            RESULTS_ENDPOINT,
            data=body.encode(),
            headers={'Content-Type': 'application/json'},
        )
        # Use the response as a context manager so the connection is closed
        # immediately instead of lingering until garbage collection.
        with urllib.request.urlopen(req, timeout=5):
            pass
    except Exception as e:
        print(f' [warn] failed to post result: {e}', file=sys.stderr)
def run_test(name: str, suite: str, fn) -> TestResult:
    """Run a single test function and return the result.

    A record with status 'running' is posted before ``fn`` is called and the
    final record is posted afterwards. An ``AssertionError`` marks the test
    'fail'; any other ``Exception`` marks it 'error' with the exception type
    prefixed to the message.

    Args:
        name: test name stored in the record.
        suite: suite name stored in the record.
        fn: zero-argument callable implementing the test.

    Returns:
        The final ``TestResult`` (status, error text, duration in ms, timestamp).
    """
    result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
    post_result(result)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the wall clock is adjusted during the test.
    start = time.perf_counter()
    try:
        fn()
    except AssertionError as e:
        result.status = 'fail'
        result.error = str(e)
    except Exception as e:
        result.status = 'error'
        result.error = f'{type(e).__name__}: {e}'
    else:
        result.status = 'pass'
    result.duration_ms = round((time.perf_counter() - start) * 1000)
    result.ts = _now_iso()
    post_result(result)
    return result
def get_api_tests() -> dict:
    """Load API tests from e2e_harness.py.

    Sets ``e2e_harness.ASSAY_BASE`` from the ``ASSAY_API`` environment variable
    (default ``http://assay-runtime-test:8000``) with any trailing slash and a
    trailing ``/api`` path segment removed.

    Returns:
        The harness test registry without entries whose name contains
        'takeover' or 'panes'.
    """
    sys.path.insert(0, os.path.dirname(__file__))
    import e2e_harness
    api_url = os.environ.get('ASSAY_API', 'http://assay-runtime-test:8000')
    # str.rstrip('/api') strips a *set of characters*, so it would also eat
    # trailing 'a'/'p'/'i' letters of the host (e.g. 'http://assay-api/api'
    # became 'http://assay-'). Remove only the literal suffix instead.
    e2e_harness.ASSAY_BASE = api_url.rstrip('/').removesuffix('/api')
    # Skip browser-dependent tests
    return {k: v for k, v in e2e_harness.TESTS.items() if 'takeover' not in k and 'panes' not in k}
def get_roundtrip_tests() -> dict:
    """Return the test registry of the Playwright roundtrip module."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on sys.path before importing the sibling module
    sys.path.insert(0, tests_dir)
    from test_roundtrip import TESTS as roundtrip_tests
    return roundtrip_tests
def get_engine_tests() -> dict:
    """Return the registry of engine-level tests (no LLM, no network)."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on sys.path before importing the sibling module
    sys.path.insert(0, tests_dir)
    from test_engine import TESTS as engine_tests
    return engine_tests
def get_matrix_tests() -> dict:
    """Return the model matrix tests (real LLM calls, test×variant combos)."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on sys.path before importing the sibling module
    sys.path.insert(0, tests_dir)
    # Alias the import so it does not shadow this function's own name
    from test_matrix import get_matrix_tests as build_matrix
    return build_matrix()
# Suite name -> loader returning that suite's test registry.
# Insertion order is the order in which main() iterates the suites.
SUITES = dict(
    engine=get_engine_tests,
    api=get_api_tests,
    matrix=get_matrix_tests,
    roundtrip=get_roundtrip_tests,
)
def parse_filters(args: list[str]) -> tuple[set[str] | None, set[str]]:
"""Parse CLI args into (suite_filter, test_filter).
Returns:
suite_filter: set of suite names, or None for all suites
test_filter: set of 'suite/test' names (empty = run all in suite)
"""
if not args:
return None, set()
suites = set()
tests = set()
for arg in args:
if '/' in arg:
tests.add(arg)
suites.add(arg.split('/')[0])
else:
suites.add(arg)
return suites, tests
def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestResult]:
    """Run tests from a suite, optionally filtered.

    Args:
        suite_name: name of the suite; used as the 'suite/' prefix of selectors.
        tests: mapping of registry test name -> zero-argument test callable.
        test_filter: 'suite/test' selectors; may contain selectors for other
            suites, which are ignored here.

    Returns:
        The results of the tests that were run, in registry order.
    """
    prefix = f'{suite_name}/'
    # Only selectors addressed to this suite may restrict it. Applying the
    # whole filter made e.g. `api/health roundtrip` skip every roundtrip test.
    wanted = {selector for selector in test_filter if selector.startswith(prefix)}
    results = []
    for name, fn in tests.items():
        # Registry names may carry the suite prefix
        # (roundtrip/full_eras matches roundtrip_full_eras); strip only a
        # leading prefix, never occurrences in the middle of the name.
        short_name = name.removeprefix(f'{suite_name}_')
        if wanted and f'{prefix}{name}' not in wanted and f'{prefix}{short_name}' not in wanted:
            continue
        r = run_test(name, suite_name, fn)
        results.append(r)
        status = 'PASS' if r.status == 'pass' else 'FAIL'
        print(f' [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
        if r.error:
            # Keep console output short; the full message is in the posted record
            print(f' {r.error[:200]}', flush=True)
    return results
def main():
    """Run the selected suites, print a summary and exit with a status code.

    Selectors come from ``sys.argv[1:]`` (see ``parse_filters``). Exit status:
    0 when no test failed, 1 when at least one test failed or errored, 2 when
    an unknown suite name was requested.
    """
    suite_filter, test_filter = parse_filters(sys.argv[1:])
    # A misspelled suite name would otherwise select nothing and exit 0,
    # which looks like a green run. Reject it before running anything.
    unknown = [s for s in sorted(suite_filter or ()) if s not in SUITES]
    if unknown:
        print(f'unknown suite(s): {", ".join(unknown)} (available: {", ".join(SUITES)})', file=sys.stderr)
        sys.exit(2)
    print(f'=== Test Run {RUN_ID} ===', flush=True)
    if suite_filter:
        print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
    print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
    print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
    print(flush=True)
    all_results = []
    for suite_name, loader in SUITES.items():
        if suite_filter and suite_name not in suite_filter:
            continue
        print(f'--- {suite_name} ---', flush=True)
        tests = loader()
        all_results.extend(run_suite(suite_name, tests, test_filter))
        print(flush=True)
    # Summary
    passed = sum(1 for r in all_results if r.status == 'pass')
    failed = sum(1 for r in all_results if r.status in ('fail', 'error'))
    total_ms = sum(r.duration_ms for r in all_results)
    print(f'=== {passed} passed, {failed} failed, {len(all_results)} total ({total_ms:.0f}ms) ===', flush=True)
    if RESULTS_ENDPOINT:
        # The summary travels through the same channel as test records;
        # its counts are carried in the `error` field.
        summary = TestResult(
            run_id=RUN_ID, test='__summary__', suite='summary',
            status='pass' if failed == 0 else 'fail',
            duration_ms=total_ms,
            error=f'{passed} passed, {failed} failed',
            ts=_now_iso(),  # timestamp the summary like every other record
        )
        post_result(summary)
    sys.exit(1 if failed else 0)
if __name__ == '__main__':
main()