Add engine test suite: 8 tests for graph loading, conditions, frame traces

New 'engine' suite in run_tests.py with tests that verify frame engine mechanics without LLM calls. Covers graph loading, node instantiation, edge type completeness, reflex/tool_output conditions, and frame trace structure for reflex/expert/expert+interpreter pipelines. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 18:01:06 +02:00 · 2026-04-03 18:01:06 +02:00 · 097c7f31f3
commit 097c7f31f3
parent 1e64b0a58c
2 changed files with 689 additions and 0 deletions
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@ -0,0 +1,198 @@
 #!/usr/bin/env python3
 """
 Test orchestrator — runs test suites and posts results to dev assay.
 Usage:
  python tests/run_tests.py                          # all suites
  python tests/run_tests.py api                      # one suite
  python tests/run_tests.py roundtrip                # one suite
  python tests/run_tests.py api/health               # single test
  python tests/run_tests.py roundtrip/full_eras      # single test
  python tests/run_tests.py api/health roundtrip/full_chat  # multiple tests
 Test names: suite/name (without the suite prefix in the test registry).
  engine tests:    graph_load, node_instantiation, edge_types_complete,
                   condition_reflex, condition_tool_output,
                   frame_trace_reflex, frame_trace_expert, frame_trace_expert_with_interpreter
  api tests:       health, eras_umsatz_api, eras_umsatz_artifact
  roundtrip tests: nyx_loads, inject_artifact, inject_message, full_chat, full_eras
 """
 import json
 import os
 import sys
 import time
 import urllib.request
 import uuid
 from datetime import datetime, timezone
 from dataclasses import dataclass, field, asdict
 # Optional HTTP endpoint that receives every result as JSON; an empty value
 # disables posting (results are then only printed).
 RESULTS_ENDPOINT = os.environ.get('RESULTS_ENDPOINT', '')
 # Identifier stamped on every result of this run; defaults to the first
 # 8 characters of a random UUID when RUN_ID is not set in the environment.
 RUN_ID = os.environ.get('RUN_ID', str(uuid.uuid4())[:8])
 def _now_iso() -> str:
    """Return the current UTC time formatted as an ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
@dataclass
 class TestResult:
    """Outcome record for one test (or the run summary), serialized to JSON when reported."""
    run_id: str  # identifier of the test run this result belongs to
    test: str  # test name as registered in its suite
    suite: str  # name of the suite the test belongs to
    status: str  # 'pass', 'fail', 'running', 'error'
    duration_ms: float = 0  # elapsed time in milliseconds
    error: str = ''  # failure or error message; empty when there is none
    ts: str = ''  # ISO 8601 timestamp string; empty when not set
 def post_result(result: TestResult):
    """Print a test result as one JSON line and, if configured, POST it.

    The result is always written to stdout. When RESULTS_ENDPOINT is set, the
    same JSON document is sent to it as the request body. Delivery is
    best-effort: any failure is reported on stderr and never raised.

    Args:
        result: The test result to report.
    """
    # Serialize once so stdout and the HTTP body are guaranteed to match.
    body = json.dumps(asdict(result))
    print(body, flush=True)
    if not RESULTS_ENDPOINT:
        return
    try:
        req = urllib.request.Request(
            RESULTS_ENDPOINT,
            data=body.encode(),
            headers={'Content-Type': 'application/json'},
        )
        # Close the response explicitly so the connection is not leaked;
        # the response body itself is not needed.
        with urllib.request.urlopen(req, timeout=5):
            pass
    except Exception as e:
        print(f'  [warn] failed to post result: {e}', file=sys.stderr)
 def run_test(name: str, suite: str, fn) -> TestResult:
    """Run one test callable, reporting its state before and after.

    A 'running' result is posted first, then the final result once ``fn``
    has returned or raised.

    Args:
        name: Test name recorded in the result.
        suite: Suite name recorded in the result.
        fn: Zero-argument callable; the test passes if it returns normally.

    Returns:
        The final TestResult: 'pass' on success, 'fail' on AssertionError,
        'error' on any other Exception.
    """
    result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
    post_result(result)
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # or jump if the wall clock is adjusted while the test runs.
    start = time.perf_counter()
    try:
        fn()
        result.status = 'pass'
    except AssertionError as e:
        result.status = 'fail'
        result.error = str(e)
    except Exception as e:
        result.status = 'error'
        result.error = f'{type(e).__name__}: {e}'
    result.duration_ms = round((time.perf_counter() - start) * 1000)
    result.ts = _now_iso()
    post_result(result)
    return result
 def get_api_tests() -> dict:
    """Load the API tests from e2e_harness.py, pointed at the configured host.

    The harness base URL is taken from the ASSAY_API environment variable
    with a trailing '/api' path segment (and trailing slashes) removed.

    Returns:
        The harness test registry without the browser-dependent tests
        (those whose name contains 'takeover' or 'panes').
    """
    sys.path.insert(0, os.path.dirname(__file__))
    import e2e_harness
    base = os.environ.get('ASSAY_API', 'http://assay-runtime-test:8000')
    # str.rstrip('/api') strips any trailing run of the characters '/', 'a',
    # 'p', 'i' (e.g. 'http://alpha/api' -> 'http://alph'), so remove the
    # literal '/api' suffix instead.
    e2e_harness.ASSAY_BASE = base.rstrip('/').removesuffix('/api').rstrip('/')
    # Skip browser-dependent tests
    return {k: v for k, v in e2e_harness.TESTS.items() if 'takeover' not in k and 'panes' not in k}
 def get_roundtrip_tests() -> dict:
    """Return the test registry of the Playwright roundtrip suite."""
    tests_dir = os.path.dirname(__file__)
    # Make the sibling test module importable regardless of the working dir.
    sys.path.insert(0, tests_dir)
    from test_roundtrip import TESTS as registry
    return registry
 def get_engine_tests() -> dict:
    """Return the test registry of the engine suite (no LLM, no network)."""
    tests_dir = os.path.dirname(__file__)
    # Make the sibling test module importable regardless of the working dir.
    sys.path.insert(0, tests_dir)
    from test_engine import TESTS as registry
    return registry
 # Suite name -> loader returning that suite's {test name: callable} registry.
 # Loaders are only called for suites that are actually selected, so a suite's
 # module is not imported unless the suite runs. main() iterates in this order.
 SUITES = {
    'engine': get_engine_tests,
    'api': get_api_tests,
    'roundtrip': get_roundtrip_tests,
 }
 def parse_filters(args: list[str]) -> tuple[set[str] | None, set[str]]:
    """Parse CLI args into (suite_filter, test_filter).
    Returns:
        suite_filter: set of suite names, or None for all suites
        test_filter: set of 'suite/test' names (empty = run all in suite)
    """
    if not args:
        return None, set()
    suites = set()
    tests = set()
    for arg in args:
        if '/' in arg:
            tests.add(arg)
            suites.add(arg.split('/')[0])
        else:
            suites.add(arg)
    return suites, tests
 def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestResult]:
    """Run a suite's tests in registry order, honouring an optional filter.

    A test is selected when the filter is empty, or when the filter contains
    'suite/name' built from either the registered name or the registered
    name with a leading '<suite>_' prefix removed (so 'roundtrip/full_eras'
    matches a test registered as 'roundtrip_full_eras').

    Args:
        suite_name: Name of the suite being run.
        tests: Mapping of test name to zero-argument test callable.
        test_filter: Set of 'suite/test' names; empty runs every test.

    Returns:
        The results of the tests that were run, in execution order.
    """
    results = []
    prefix = f'{suite_name}_'
    for name, fn in tests.items():
        if test_filter:
            # removeprefix() strips only a leading '<suite>_'; str.replace()
            # would also delete occurrences in the middle of the name.
            candidates = {
                f'{suite_name}/{name}',
                f'{suite_name}/{name.removeprefix(prefix)}',
            }
            if candidates.isdisjoint(test_filter):
                continue
        r = run_test(name, suite_name, fn)
        results.append(r)
        # Both 'fail' and 'error' are shown as FAIL in the console line.
        status = 'PASS' if r.status == 'pass' else 'FAIL'
        print(f'  [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
        if r.error:
            print(f'           {r.error[:200]}', flush=True)
    return results
 def main():
    """Run the selected suites, print a summary, and exit with the run status.

    Command-line arguments are interpreted by parse_filters(). Suites run in
    SUITES order. When RESULTS_ENDPOINT is set, a '__summary__' result is
    posted after the run. Exits with status 1 if any test failed or errored,
    otherwise 0.
    """
    suite_filter, test_filter = parse_filters(sys.argv[1:])
    print(f'=== Test Run {RUN_ID} ===', flush=True)
    if suite_filter:
        print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
        # A mistyped suite name would otherwise silently select nothing and
        # still produce a green run, so point it out.
        unknown = sorted(suite_filter.difference(SUITES))
        if unknown:
            print(f'  [warn] unknown suite(s): {", ".join(unknown)}', file=sys.stderr)
    print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
    print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
    print(flush=True)
    all_results = []
    for suite_name, loader in SUITES.items():
        if suite_filter and suite_name not in suite_filter:
            continue
        print(f'--- {suite_name} ---', flush=True)
        tests = loader()
        all_results.extend(run_suite(suite_name, tests, test_filter))
        print(flush=True)
    # Summary
    passed = sum(1 for r in all_results if r.status == 'pass')
    failed = sum(1 for r in all_results if r.status in ('fail', 'error'))
    total_ms = sum(r.duration_ms for r in all_results)
    print(f'=== {passed} passed, {failed} failed, {len(all_results)} total ({total_ms:.0f}ms) ===', flush=True)
    if RESULTS_ENDPOINT:
        summary = TestResult(
            run_id=RUN_ID, test='__summary__', suite='summary',
            status='pass' if failed == 0 else 'fail',
            duration_ms=total_ms,
            error=f'{passed} passed, {failed} failed',
            # Timestamp the summary like every other posted result.
            ts=_now_iso(),
        )
        post_result(summary)
    sys.exit(1 if failed else 0)
 # Run the orchestrator only when executed as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@ -0,0 +1,491 @@
 """Engine test suite — tests graph loading, node instantiation, frame engine
 routing, conditions, and trace structure. No LLM calls — all nodes mocked.
 Tests:
  graph_load           — load_graph returns correct structure for all graphs
  node_instantiation   — instantiate_nodes creates all roles from registry
  edge_types_complete  — all 3 edge types present, no orphan nodes
  condition_reflex     — reflex condition fires on social+trivial only
  condition_tool_output — has_tool_output condition fires when tool data present
  frame_trace_reflex   — reflex path produces 2-frame trace
  frame_trace_expert   — expert path produces correct frame sequence
  frame_trace_expert_with_interpreter — expert path with tool output adds an interpreter frame
 """
 import asyncio
 import os
 import sys
 import time
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
 from agent.engine import load_graph, instantiate_nodes, _graph_from_module
 from agent.frame_engine import FrameEngine, FrameTrace, FrameRecord
 from agent.types import (
    Envelope, Command, InputAnalysis, ThoughtResult,
    DirectorPlan, PARouting, InterpretedResult, Artifact,
 )
 # --- Helpers ---
 class MockSink:
    """Test double for the output sink; records everything sent to it."""
    def __init__(self):
        self.deltas, self.controls, self.artifacts = [], [], []
        self.done_count = 0
    async def send_delta(self, text):
        """Record one streamed text chunk."""
        self.deltas += [text]
    async def send_controls(self, controls):
        """Remember the most recently sent controls."""
        self.controls = controls
    async def send_artifacts(self, artifacts):
        """Remember the most recently sent artifacts."""
        self.artifacts = artifacts
    async def send_done(self):
        """Count one completion signal."""
        self.done_count = self.done_count + 1
    def reset(self):
        """Drop the recorded text chunks in place; other fields are kept."""
        del self.deltas[:]
 class MockHud:
    """Awaitable callable that records every HUD event passed to it."""
    def __init__(self):
        self.events = []
    async def __call__(self, data):
        """Append one event payload to the recorded list."""
        self.events.append(data)
    def find(self, event):
        """Return the recorded events whose "event" entry equals ``event``."""
        return list(filter(lambda entry: entry.get("event") == event, self.events))
 class MockMemorizer:
    """Memorizer stand-in with a fixed state and a constant context block."""
    def __init__(self):
        self.state = dict(
            user_name="test",
            user_mood="neutral",
            topic="testing",
            topic_history=[],
            language="en",
            style_hint="casual",
            facts=[],
            user_expectation="conversational",
        )
    def get_context_block(self, sensor_lines=None, ui_state=None):
        """Return a constant context string; both arguments are ignored."""
        return "Memory: test context"
    async def update(self, history):
        """Accept the history and do nothing."""
        return None
 class MockSensor:
    """Sensor stand-in: no-op notifications, a constant context line, drainable flags."""
    def __init__(self):
        self._flags = []
    def note_user_activity(self):
        """Do nothing."""
        return None
    def update_browser_dashboard(self, dashboard):
        """Ignore the dashboard."""
        return None
    def get_context_lines(self):
        """Return a single constant context line."""
        return ["Sensors: test"]
    def consume_flags(self):
        """Return a copy of the pending flags and empty the internal list."""
        drained = list(self._flags)
        del self._flags[:]
        return drained
 class MockUINode:
    """UI node stand-in whose machine-related methods return empty results."""
    def __init__(self):
        self._artifacts = []
        self.state = {}
        self.thinker_controls = []
    @property
    def current_controls(self):
        """Alias for ``thinker_controls``."""
        return self.thinker_controls
    @current_controls.setter
    def current_controls(self, value):
        self.thinker_controls = value
    async def process(self, thought, history, memory_context=""):
        """Ignore the inputs and return the current controls."""
        return self.current_controls
    def get_machine_summary(self):
        """Return an empty summary."""
        return ""
    def get_machine_controls(self):
        """Return a new empty list."""
        return list()
    def get_artifacts(self):
        """Return the internal artifact list."""
        return self._artifacts
    def try_machine_transition(self, action):
        """Report that no transition happened."""
        return (False, "")
    async def process_local_action(self, action, data):
        """Return no result and no controls."""
        return (None, [])
 class MockInputNode:
    """Input node stand-in that answers every message with a fixed analysis."""
    def __init__(self, intent="request", complexity="simple", topic="test", language="en"):
        self._intent, self._complexity = intent, complexity
        self._topic, self._language = topic, language
    async def process(self, envelope, history, memory_context="", identity="", channel=""):
        """Build a Command from the configured analysis and the envelope text."""
        analysis = InputAnalysis(
            intent=self._intent, topic=self._topic,
            complexity=self._complexity, language=self._language,
            tone="casual",
        )
        return Command(analysis=analysis, source_text=envelope.text)
 class MockOutputNode:
    """Output node stand-in that streams the thought's response through the sink."""
    async def process(self, thought, history, sink, memory_context=""):
        """Send the response (or "ok" if empty) in 12-character chunks, then signal done.

        Returns the full text that was streamed.
        """
        text = thought.response or "ok"
        chunks = (text[start:start + 12] for start in range(0, len(text), 12))
        for chunk in chunks:
            await sink.send_delta(chunk)
        await sink.send_done()
        return text
 class MockPANode:
    """PA node stand-in that always routes to one configured expert."""
    def __init__(self, expert="eras", job="test query", thinking_msg="Working..."):
        self._expert, self._job = expert, job
        self._thinking_msg = thinking_msg
    def set_available_experts(self, experts):
        """Ignore the expert list."""
        return None
    async def route(self, command, history, memory_context="", identity="", channel=""):
        """Return a PARouting built from the configured expert, job and message."""
        routing_fields = {
            "expert": self._expert,
            "job": self._job,
            "thinking_message": self._thinking_msg,
            "language": "en",
        }
        return PARouting(**routing_fields)
    async def route_retry(self, command, history, memory_context="", identity="",
                          channel="", original_job="", errors=None):
        """Return a PARouting for the same expert with the job prefixed by "retry: "."""
        retry_job = f"retry: {self._job}"
        return PARouting(expert=self._expert, job=retry_job, language="en")
 class MockExpertNode:
    """Expert node stand-in that returns one configured ThoughtResult."""
    def __init__(self, response="expert result", tool_used="", tool_output="", errors=None):
        self._response = response
        self._tool_used, self._tool_output = tool_used, tool_output
        # Any falsy value for errors becomes a fresh empty list.
        self._errors = errors if errors else []
        self.send_hud = MockHud()
    async def execute(self, job, language):
        """Ignore the job and language and return the configured result."""
        result_fields = {
            "response": self._response,
            "tool_used": self._tool_used,
            "tool_output": self._tool_output,
            "errors": self._errors,
        }
        return ThoughtResult(**result_fields)
 class MockDirectorNode:
    """Director node stand-in that returns one configured DirectorPlan."""
    def __init__(self, goal="test", tools=None, hint=""):
        self._goal, self._hint = goal, hint
        # Any falsy value for tools becomes a fresh empty list.
        self._tools = tools if tools else []
    async def decide(self, command, history, memory_context=""):
        """Ignore the inputs and return the configured plan."""
        plan_fields = {
            "goal": self._goal,
            "tool_sequence": self._tools,
            "response_hint": self._hint,
        }
        return DirectorPlan(**plan_fields)
    def get_context_line(self):
        """Return an empty context line."""
        return ""
 class MockThinkerNode:
    """Thinker node stand-in that returns one configured ThoughtResult."""
    def __init__(self, response="thought result", tool_used="", tool_output=""):
        self._response = response
        self._tool_used, self._tool_output = tool_used, tool_output
    async def process(self, command, plan=None, history=None, memory_context=""):
        """Ignore the inputs and return the configured result."""
        result_fields = {
            "response": self._response,
            "tool_used": self._tool_used,
            "tool_output": self._tool_output,
        }
        return ThoughtResult(**result_fields)
 class MockInterpreterNode:
    """Interpreter node stand-in that returns a fixed five-row interpretation."""
    async def interpret(self, tool_used, tool_output, job):
        """Return an InterpretedResult whose summary names the tool that ran."""
        summary = f"Interpreted: {tool_used} returned data"
        return InterpretedResult(summary=summary, row_count=5, key_facts=["5 rows"])
 def make_frame_engine(nodes, graph_name="v4-eras"):
    """Build a FrameEngine for the named graph, wired to mock collaborators.

    Returns the engine together with the MockSink and MockHud it was given,
    so callers can inspect what was streamed and which HUD events were sent.
    """
    graph = load_graph(graph_name)
    sink, hud = MockSink(), MockHud()
    collaborators = dict(
        sink=sink,
        history=[],
        send_hud=hud,
        sensor=MockSensor(),
        memorizer=MockMemorizer(),
        ui_node=MockUINode(),
        identity="test_user",
        channel="test",
    )
    engine = FrameEngine(graph=graph, nodes=nodes, **collaborators)
    return engine, sink, hud
 # --- Tests ---
 def test_graph_load():
    """Frame-based graphs load with nodes, edges and conditions; v1 is imperative."""
    frame_graphs = ("v3-framed", "v4-eras")
    for name in frame_graphs:
        g = load_graph(name)
        assert g["name"] == name, f"graph name mismatch: {g['name']} != {name}"
        assert g["engine"] == "frames", f"{name} should use frames engine"
        # Each section must be present and non-empty.
        assert "nodes" in g, f"{name} has no nodes"
        assert len(g["nodes"]) > 0, f"{name} has no nodes"
        assert "edges" in g, f"{name} has no edges"
        assert len(g["edges"]) > 0, f"{name} has no edges"
        assert "conditions" in g, f"{name} has no conditions"
    # v1 should be imperative
    legacy = load_graph("v1-current")
    assert legacy["engine"] == "imperative", "v1 should be imperative"
 def test_node_instantiation():
    """instantiate_nodes returns an entry for every role a graph declares."""
    hud = MockHud()
    core_roles = ("input", "output", "memorizer", "sensor")
    for name in ("v3-framed", "v4-eras"):
        g = load_graph(name)
        nodes = instantiate_nodes(g, hud)
        for role in g["nodes"]:
            assert role in nodes, f"missing node role '{role}' in {name}"
        # Roles every frame-based graph is expected to provide.
        for required in core_roles:
            assert required in nodes
 def test_edge_types_complete():
    """Each graph uses data, context and state edges and connects every node."""
    for name in ("v3-framed", "v4-eras"):
        g = load_graph(name)
        edges = g["edges"]
        edge_types = set()
        for e in edges:
            edge_types.add(e.get("type"))
        assert "data" in edge_types, f"{name} missing data edges"
        assert "context" in edge_types, f"{name} missing context edges"
        assert "state" in edge_types, f"{name} missing state edges"
        # Collect every node referenced as an edge source or target.
        node_roles = set(g["nodes"].keys())
        edge_nodes = set()
        for e in edges:
            edge_nodes.add(e["from"])
            to = e["to"]
            # An edge target is either a single node or a list of nodes.
            targets = to if isinstance(to, list) else [to]
            edge_nodes.update(targets)
        # runtime is a virtual target, not a real node
        edge_nodes.discard("runtime")
        missing = node_roles - edge_nodes
        assert not missing, f"{name} has orphan nodes: {missing}"
 def test_condition_reflex():
    """The 'reflex' condition holds for social+trivial input and nothing else tried here."""
    base_nodes = {
        "input": MockInputNode(),
        "output": MockOutputNode(),
        "memorizer": MockMemorizer(),
        "sensor": MockSensor(),
        "ui": MockUINode(),
    }
    engine, _, _ = make_frame_engine(base_nodes, "v4-eras")
    # (intent, complexity, source text, should fire, failure message)
    cases = [
        ("social", "trivial", "hi", True,
         "reflex should fire for social+trivial"),
        ("request", "simple", "show data", False,
         "reflex should not fire for request+simple"),
        ("social", "complex", "tell me a long story", False,
         "reflex should not fire for social+complex"),
    ]
    for intent, complexity, text, should_fire, message in cases:
        cmd = Command(
            analysis=InputAnalysis(intent=intent, complexity=complexity),
            source_text=text,
        )
        fired = engine._check_condition("reflex", command=cmd)
        assert bool(fired) is should_fire, message
 def test_condition_tool_output():
    """The 'has_tool_output' condition holds only with non-empty tool output."""
    base_nodes = {
        "input": MockInputNode(),
        "output": MockOutputNode(),
        "memorizer": MockMemorizer(),
        "sensor": MockSensor(),
        "ui": MockUINode(),
    }
    engine, _, _ = make_frame_engine(base_nodes, "v4-eras")
    # (ThoughtResult fields, should fire, failure message)
    cases = [
        ({"response": "data", "tool_used": "query_db", "tool_output": "rows here"}, True,
         "should fire when tool_used and tool_output both set"),
        ({"response": "just text"}, False,
         "should not fire when no tool output"),
        ({"response": "x", "tool_used": "query_db", "tool_output": ""}, False,
         "should not fire when tool_output is empty string"),
    ]
    for fields, should_fire, message in cases:
        thought = ThoughtResult(**fields)
        fired = engine._check_condition("has_tool_output", thought=thought)
        assert bool(fired) is should_fire, message
 def test_frame_trace_reflex():
    """Reflex path: 2 frames (input → output), path='reflex'."""
    nodes = {
        "input": MockInputNode(intent="social", complexity="trivial"),
        "output": MockOutputNode(),
        "pa": MockPANode(),
        "expert_eras": MockExpertNode(),
        "interpreter": MockInterpreterNode(),
        "memorizer": MockMemorizer(),
        "sensor": MockSensor(),
        "ui": MockUINode(),
    }
    engine, _, _ = make_frame_engine(nodes, "v4-eras")
    # Drive the coroutine on a dedicated loop: asyncio.get_event_loop() is
    # deprecated when no loop is running and raises RuntimeError on Python
    # 3.14+. The thread's current-loop setting is left untouched, and the
    # loop is closed afterwards so it is not leaked.
    loop = asyncio.new_event_loop()
    try:
        result = loop.run_until_complete(engine.process_message("hello"))
    finally:
        loop.close()
    trace = result["trace"]
    assert trace["path"] == "reflex", f"expected reflex path, got {trace['path']}"
    assert trace["total_frames"] == 2, f"expected 2 frames, got {trace['total_frames']}"
    assert len(trace["frames"]) == 2
    assert trace["frames"][0]["node"] == "input"
    assert trace["frames"][1]["node"] == "output"
    # The input frame records the condition that selected the reflex path.
    assert "reflex=True" in trace["frames"][0]["condition"]
 def test_frame_trace_expert():
    """Expert path without tool output: F1(input)→F2(pa)→F3(expert)→F4(output+ui)."""
    nodes = {
        "input": MockInputNode(intent="request", complexity="simple"),
        "output": MockOutputNode(),
        "pa": MockPANode(expert="eras", job="get top customers"),
        "expert_eras": MockExpertNode(response="Here are the customers"),
        "interpreter": MockInterpreterNode(),
        "memorizer": MockMemorizer(),
        "sensor": MockSensor(),
        "ui": MockUINode(),
    }
    engine, _, _ = make_frame_engine(nodes, "v4-eras")
    # Drive the coroutine on a dedicated loop: asyncio.get_event_loop() is
    # deprecated when no loop is running and raises RuntimeError on Python
    # 3.14+. The thread's current-loop setting is left untouched, and the
    # loop is closed afterwards so it is not leaked.
    loop = asyncio.new_event_loop()
    try:
        result = loop.run_until_complete(engine.process_message("show top customers"))
    finally:
        loop.close()
    trace = result["trace"]
    assert trace["path"] == "expert", f"expected expert path, got {trace['path']}"
    assert trace["total_frames"] >= 4, f"expected >=4 frames, got {trace['total_frames']}"
    nodes_in_trace = [f["node"] for f in trace["frames"]]
    assert nodes_in_trace[0] == "input"
    assert nodes_in_trace[1] == "pa"
    # Substring match: the third frame's node label must mention the expert.
    assert "expert_eras" in nodes_in_trace[2]
 def test_frame_trace_expert_with_interpreter():
    """Expert path with tool output: includes interpreter frame, path='expert+interpreter'."""
    nodes = {
        "input": MockInputNode(intent="request", complexity="simple"),
        "output": MockOutputNode(),
        "pa": MockPANode(expert="eras", job="query customers"),
        "expert_eras": MockExpertNode(
            response="raw data",
            tool_used="query_db",
            tool_output="customer_name,revenue\nAcme,1000",
        ),
        "interpreter": MockInterpreterNode(),
        "memorizer": MockMemorizer(),
        "sensor": MockSensor(),
        "ui": MockUINode(),
    }
    engine, _, _ = make_frame_engine(nodes, "v4-eras")
    # Drive the coroutine on a dedicated loop: asyncio.get_event_loop() is
    # deprecated when no loop is running and raises RuntimeError on Python
    # 3.14+. The thread's current-loop setting is left untouched, and the
    # loop is closed afterwards so it is not leaked.
    loop = asyncio.new_event_loop()
    try:
        result = loop.run_until_complete(engine.process_message("show customer revenue"))
    finally:
        loop.close()
    trace = result["trace"]
    assert trace["path"] == "expert+interpreter", \
        f"expected expert+interpreter path, got {trace['path']}"
    nodes_in_trace = [f["node"] for f in trace["frames"]]
    assert "interpreter" in nodes_in_trace, "interpreter frame missing"
    assert trace["total_frames"] >= 5, f"expected >=5 frames, got {trace['total_frames']}"
 # --- Test registry (for run_tests.py) ---
 # Maps each test's registry name to its zero-argument test function.
 TESTS = {
    'graph_load': test_graph_load,
    'node_instantiation': test_node_instantiation,
    'edge_types_complete': test_edge_types_complete,
    'condition_reflex': test_condition_reflex,
    'condition_tool_output': test_condition_tool_output,
    'frame_trace_reflex': test_frame_trace_reflex,
    'frame_trace_expert': test_frame_trace_expert,
    'frame_trace_expert_with_interpreter': test_frame_trace_expert_with_interpreter,
 }