agent-runtime/tests/run_tests.py

#!/usr/bin/env python3
"""
Test orchestrator — runs test suites and posts results to dev assay.

Usage:
  python tests/run_tests.py                          # all suites
  python tests/run_tests.py api                      # one suite
  python tests/run_tests.py roundtrip                # one suite
  python tests/run_tests.py api/health               # single test
  python tests/run_tests.py roundtrip/full_eras      # single test
  python tests/run_tests.py api/health roundtrip/full_chat  # multiple tests

Test names: suite/name (without the suite prefix in the test registry).
  engine tests:    graph_load, node_instantiation, edge_types_complete,
                   condition_reflex, condition_tool_output,
                   frame_trace_reflex, frame_trace_expert, frame_trace_expert_with_interpreter
  api tests:       health, eras_umsatz_api, eras_umsatz_artifact
  matrix tests:    eras_query[variant], eras_artifact[variant], social_reflex[variant]
                   variants: gemini-flash, haiku, gpt-4o-mini
  roundtrip tests: nyx_loads, inject_artifact, inject_message, full_chat, full_eras
"""

import json
import os
import sys
import time
import urllib.request
import uuid
from datetime import datetime, timezone
from dataclasses import dataclass, field, asdict

# URL that results are posted to; an empty value disables posting.
RESULTS_ENDPOINT = os.getenv('RESULTS_ENDPOINT', '')
# Identifier attached to every result of this run; falls back to the first
# 8 characters of a freshly generated UUID when RUN_ID is not set.
RUN_ID = os.getenv('RUN_ID', str(uuid.uuid4())[:8])


def _now_iso() -> str:
    """Return the current UTC time formatted as an ISO 8601 string."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()


@dataclass
class TestResult:
    """Record describing the state or outcome of one test.

    Instances are converted with ``asdict`` and emitted as JSON, so the field
    names below are also the keys of the emitted objects.
    """

    run_id: str  # identifier of the orchestrator run this record belongs to
    test: str  # test name as registered in its suite
    suite: str  # name of the suite the test belongs to
    status: str  # 'pass', 'fail', 'running', 'error'
    duration_ms: float = 0  # elapsed time of the test in milliseconds
    error: str = ''  # failure / exception message; empty when there is none
    ts: str = ''  # ISO 8601 timestamp of when the record was produced


def post_result(result: TestResult):
    """Print a test result as JSON and, if configured, post it to the dev assay endpoint.

    The JSON line is always written to stdout. When ``RESULTS_ENDPOINT`` is
    empty nothing is sent. Posting is best-effort: any failure is reported as
    a warning on stderr and is never raised to the caller.

    Args:
        result: The result record to emit.
    """
    # Serialise once and reuse the text for both stdout and the request body.
    body = json.dumps(asdict(result))
    print(body, flush=True)
    if not RESULTS_ENDPOINT:
        return
    try:
        req = urllib.request.Request(
            RESULTS_ENDPOINT,
            data=body.encode(),
            headers={'Content-Type': 'application/json'},
        )
        # Close the response explicitly; otherwise every posted result leaves
        # its connection open until garbage collection.
        with urllib.request.urlopen(req, timeout=5):
            pass
    except Exception as e:
        print(f'  [warn] failed to post result: {e}', file=sys.stderr)


def run_test(name: str, suite: str, fn) -> TestResult:
    """Run a single test function and return the result.

    A 'running' record is posted before the test starts and the final record
    is posted once it finishes.

    Args:
        name: Test name recorded in the result.
        suite: Suite name recorded in the result.
        fn: Zero-argument callable implementing the test.

    Returns:
        The final result: 'pass' if ``fn`` returned normally, 'fail' if it
        raised ``AssertionError``, 'error' for any other ``Exception``.
    """
    result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
    post_result(result)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-test.
    start = time.perf_counter()
    try:
        fn()
    except AssertionError as e:
        result.status = 'fail'
        # A bare `assert cond` carries no message; keep the error non-empty so
        # the failure is still visible wherever the error text is displayed.
        result.error = str(e) or 'AssertionError'
    except Exception as e:
        result.status = 'error'
        result.error = f'{type(e).__name__}: {e}'
    else:
        result.status = 'pass'
    result.duration_ms = round((time.perf_counter() - start) * 1000)
    result.ts = _now_iso()

    post_result(result)
    return result


def get_api_tests() -> dict:
    """Load API tests from e2e_harness.py.

    Points ``e2e_harness.ASSAY_BASE`` at the URL from the ``ASSAY_API``
    environment variable (default ``http://assay-runtime-test:8000``) with a
    trailing ``/api`` path segment and trailing slashes removed.

    Returns:
        The harness test registry without the entries whose name contains
        'takeover' or 'panes'.
    """
    sys.path.insert(0, os.path.dirname(__file__))
    import e2e_harness

    api_url = os.environ.get('ASSAY_API', 'http://assay-runtime-test:8000')
    # str.rstrip('/api') strips any trailing run of the characters '/', 'a',
    # 'p', 'i' (e.g. 'http://my-api/api' -> 'http://my-'); remove the literal
    # suffix instead.
    e2e_harness.ASSAY_BASE = api_url.rstrip('/').removesuffix('/api').rstrip('/')
    # Skip browser-dependent tests
    return {k: v for k, v in e2e_harness.TESTS.items() if 'takeover' not in k and 'panes' not in k}


def get_roundtrip_tests() -> dict:
    """Return the test registry exposed by the test_roundtrip module."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on the import path so the sibling
    # module resolves.
    sys.path.insert(0, tests_dir)
    import test_roundtrip
    return test_roundtrip.TESTS


def get_engine_tests() -> dict:
    """Return the test registry exposed by the test_engine module."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on the import path so the sibling
    # module resolves.
    sys.path.insert(0, tests_dir)
    import test_engine
    return test_engine.TESTS


def get_matrix_tests() -> dict:
    """Return whatever test_matrix.get_matrix_tests() builds (test×variant combos)."""
    tests_dir = os.path.dirname(__file__)
    # Put this file's directory first on the import path so the sibling
    # module resolves.
    sys.path.insert(0, tests_dir)
    import test_matrix
    return test_matrix.get_matrix_tests()


# Registry mapping each suite name to the zero-argument loader that returns
# its tests. main() iterates this dict, so suites run in the order listed here.
SUITES = {
    'engine': get_engine_tests,
    'api': get_api_tests,
    'matrix': get_matrix_tests,
    'roundtrip': get_roundtrip_tests,
}


def parse_filters(args: list[str]) -> tuple[set[str] | None, set[str]]:
    """Parse CLI args into (suite_filter, test_filter).

    Returns:
        suite_filter: set of suite names, or None for all suites
        test_filter: set of 'suite/test' names (empty = run all in suite)
    """
    if not args:
        return None, set()

    suites = set()
    tests = set()
    for arg in args:
        if '/' in arg:
            tests.add(arg)
            suites.add(arg.split('/')[0])
        else:
            suites.add(arg)
    return suites, tests


def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestResult]:
    """Run the tests of one suite, optionally restricted to selected tests.

    Args:
        suite_name: Name of the suite being run.
        tests: Mapping of test name to zero-argument test callable.
        test_filter: 'suite/test' selectors. Only selectors addressed to
            ``suite_name`` are considered; when there are none, every test in
            the suite runs. A selector matches a test either by its full
            registered name or by that name without a leading
            ``'<suite_name>_'`` prefix.

    Returns:
        The results of the tests that were run, in execution order.
    """
    prefix = f'{suite_name}/'
    # Selectors aimed at other suites must not filter this one; otherwise
    # e.g. `roundtrip api/health` would run no roundtrip tests at all.
    wanted = {selector for selector in test_filter if selector.startswith(prefix)}

    results = []
    for name, fn in tests.items():
        if wanted:
            # Strip only a leading suite prefix (roundtrip/full_eras matches
            # roundtrip_full_eras); str.replace would also rewrite occurrences
            # in the middle of the name.
            short_name = name.removeprefix(f'{suite_name}_')
            if f'{prefix}{name}' not in wanted and f'{prefix}{short_name}' not in wanted:
                continue

        r = run_test(name, suite_name, fn)
        results.append(r)
        status = 'PASS' if r.status == 'pass' else 'FAIL'
        print(f'  [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
        if r.error:
            # Keep console output compact; the full message is in the result.
            print(f'           {r.error[:200]}', flush=True)
    return results


def main():
    """Run the selected suites, print a summary and exit with the run's status.

    Filters come from ``sys.argv[1:]``. The process exits with status 1 when
    any test failed or errored, or when a requested suite name does not
    exist; otherwise it exits with 0. When ``RESULTS_ENDPOINT`` is set, a
    ``__summary__`` record is posted after the run.
    """
    suite_filter, test_filter = parse_filters(sys.argv[1:])

    print(f'=== Test Run {RUN_ID} ===', flush=True)
    if suite_filter:
        print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
    print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
    print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
    print(flush=True)

    # A mistyped suite name would otherwise select nothing and the run would
    # report success without executing a single test.
    unknown = sorted(suite_filter - set(SUITES)) if suite_filter else []
    if unknown:
        print(
            f'  [error] unknown suite(s): {", ".join(unknown)} '
            f'(available: {", ".join(SUITES)})',
            file=sys.stderr,
        )

    all_results = []

    for suite_name, loader in SUITES.items():
        if suite_filter and suite_name not in suite_filter:
            continue
        print(f'--- {suite_name} ---', flush=True)
        tests = loader()
        all_results.extend(run_suite(suite_name, tests, test_filter))
        print(flush=True)

    # Summary
    passed = sum(1 for r in all_results if r.status == 'pass')
    failed = sum(1 for r in all_results if r.status in ('fail', 'error'))
    total_ms = sum(r.duration_ms for r in all_results)
    print(f'=== {passed} passed, {failed} failed, {len(all_results)} total ({total_ms:.0f}ms) ===', flush=True)

    run_ok = failed == 0 and not unknown

    if RESULTS_ENDPOINT:
        detail = f'{passed} passed, {failed} failed'
        if unknown:
            detail += f', unknown suites: {", ".join(unknown)}'
        summary = TestResult(
            run_id=RUN_ID, test='__summary__', suite='summary',
            status='pass' if run_ok else 'fail',
            duration_ms=total_ms,
            error=detail,
            # Timestamp the summary like every other posted record.
            ts=_now_iso(),
        )
        post_result(summary)

    sys.exit(0 if run_ok else 1)


# Run the orchestrator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()