feat(conformance): Implements `adk conformance test` cli with replay … · codenamenam/adk-python@e86647d · GitHub
[go: up one dir, main page]

Skip to content

Commit e86647d

Browse files
Jacksunwei and copybara-github
authored and committed
feat(conformance): Implements adk conformance test cli with replay mode
PiperOrigin-RevId: 808633566
1 parent c9ea80a commit e86647d

File tree

4 files changed

+671
-0
lines changed

4 files changed

+671
-0
lines changed

src/google/adk/cli/cli_tools_click.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,99 @@ def cli_conformance_create(
179179
asyncio.run(run_conformance_create(test_paths))
180180

181181

182+
@conformance.command("test", cls=HelpfulCommand)
@click.argument(
    "paths",
    nargs=-1,
    type=click.Path(
        exists=True, file_okay=False, dir_okay=True, resolve_path=True
    ),
)
@click.option(
    "--mode",
    type=click.Choice(["replay", "live"], case_sensitive=False),
    default="replay",
    show_default=True,
    help=(
        "Test mode: 'replay' verifies against recorded interactions, 'live'"
        " runs evaluation-based verification."
    ),
)
@click.pass_context
def cli_conformance_test(
    ctx,
    paths: tuple[str, ...],
    mode: str,
):
  """Run conformance tests to verify agent behavior consistency.

  Validates that agents produce consistent outputs by comparing against recorded
  interactions or evaluating live execution results.

  PATHS can be any number of folder paths. Each folder can either:
  - Contain a spec.yaml file directly (single test case)
  - Contain subdirectories with spec.yaml files (multiple test cases)

  If no paths are provided, defaults to searching the 'tests' folder.

  TEST MODES:

  \b
  replay : Verifies agent interactions match previously recorded behaviors
           exactly. Compares LLM requests/responses and tool calls/results.
  live : Runs evaluation-based verification (not yet implemented)

  DIRECTORY STRUCTURE:

  Test cases must follow this structure:

  \b
  category/
    test_name/
      spec.yaml # Test specification
      generated-recordings.yaml # Recorded interactions (replay mode)
      generated-session.yaml # Session data (replay mode)

  EXAMPLES:

  \b
  # Run all tests in current directory's 'tests' folder
  adk conformance test

  \b
  # Run tests from specific folders
  adk conformance test tests/core tests/tools

  \b
  # Run a single test case
  adk conformance test tests/core/description_001

  \b
  # Run in live mode (when available)
  adk conformance test --mode=live tests/core
  """
  # The runner is imported inside the command so that a missing dependency is
  # reported as a readable CLI error instead of an import-time traceback.
  try:
    from .conformance.cli_test import run_conformance_test
  except ImportError as import_error:
    notices = (
        (f"Error: Missing conformance testing dependencies: {import_error}", "red"),
        (
            "Please install the required conformance testing package dependencies.",
            "yellow",
        ),
    )
    for text, colour in notices:
      click.secho(text, fg=colour, err=True)
    ctx.exit(1)

  # Click has already resolved any PATHS given on the command line; with none
  # given, fall back to the 'tests' folder under the current directory.
  if paths:
    test_paths = [Path(given) for given in paths]
  else:
    test_paths = [Path("tests").resolve()]

  # --mode is matched case-insensitively, so normalise before handing it on.
  selected_mode = mode.lower()
  asyncio.run(run_conformance_test(test_paths=test_paths, mode=selected_mode))
273+
274+
182275
@main.command("create", cls=HelpfulCommand)
183276
@click.option(
184277
"--model",
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Loading utilities for conformance testing."""
16+
17+
from __future__ import annotations
18+
19+
from pathlib import Path
20+
from typing import Any
21+
from typing import Optional
22+
23+
import click
24+
import yaml
25+
26+
from ...sessions.session import Session
27+
from .test_case import TestSpec
28+
29+
30+
def load_test_case(test_case_dir: Path) -> TestSpec:
  """Read ``spec.yaml`` from a test case folder and validate it as a TestSpec.

  Args:
    test_case_dir: Folder that contains the ``spec.yaml`` file.

  Returns:
    The TestSpec built from the parsed YAML content.
  """
  spec_path = test_case_dir / "spec.yaml"
  with spec_path.open("r", encoding="utf-8") as stream:
    parsed: dict[str, Any] = yaml.safe_load(stream)
  return TestSpec.model_validate(parsed)
36+
37+
38+
def load_recorded_session(test_case_dir: Path) -> Optional[Session]:
  """Read the recorded session stored in ``generated-session.yaml``.

  Args:
    test_case_dir: Folder expected to contain ``generated-session.yaml``.

  Returns:
    The validated Session, or None when the file is absent, parses to an empty
    value, or fails validation (a warning is printed to stderr in that case).
  """
  session_path = test_case_dir / "generated-session.yaml"
  if session_path.exists():
    with session_path.open("r", encoding="utf-8") as stream:
      raw_session = yaml.safe_load(stream)

    if raw_session:
      try:
        return Session.model_validate(raw_session)
      except Exception as exc:
        # Best effort: an unreadable recording is reported, not fatal.
        click.secho(
            f"Warning: Failed to parse session data: {exc}",
            fg="yellow",
            err=True,
        )

  return None
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Validation logic for conformance test replay mode."""
16+
17+
from __future__ import annotations
18+
19+
from dataclasses import dataclass
20+
import difflib
21+
import json
22+
from typing import Optional
23+
24+
from ...events.event import Event
25+
from ...sessions.session import Session
26+
27+
28+
@dataclass
class ComparisonResult:
  """Outcome of one conformance comparison between two objects."""

  # True when the compared objects matched.
  success: bool
  # Human-readable description of the mismatch; left as None on success.
  error_message: Optional[str] = None
34+
35+
36+
def _generate_mismatch_message(
37+
context: str, actual_value: str, recorded_value: str
38+
) -> str:
39+
"""Generate a generic mismatch error message."""
40+
return (
41+
f"{context} mismatch - \nActual: \n{actual_value} \nRecorded:"
42+
f" \n{recorded_value}"
43+
)
44+
45+
46+
def _generate_diff_message(
47+
context: str, actual_dict: dict, recorded_dict: dict
48+
) -> str:
49+
"""Generate a diff-based error message for comparison failures."""
50+
# Convert to pretty-printed JSON for better readability
51+
actual_json = json.dumps(actual_dict, indent=2, sort_keys=True)
52+
recorded_json = json.dumps(recorded_dict, indent=2, sort_keys=True)
53+
54+
# Generate unified diff
55+
diff_lines = list(
56+
difflib.unified_diff(
57+
recorded_json.splitlines(keepends=True),
58+
actual_json.splitlines(keepends=True),
59+
fromfile=f"recorded {context}\n",
60+
tofile=f"actual {context}\n",
61+
lineterm="",
62+
)
63+
)
64+
65+
if diff_lines:
66+
return f"{context} mismatch:\n" + "".join(diff_lines)
67+
else:
68+
# Fallback to generic format if diff doesn't work
69+
return _generate_mismatch_message(context, actual_json, recorded_json)
70+
71+
72+
def compare_event(
    actual_event: Event, recorded_event: Event, index: int
) -> ComparisonResult:
  """Check one event from the current run against its recorded counterpart.

  Both events are dumped with ``model_dump`` (dropping ``None`` values and the
  per-run fields listed below) and the resulting dicts are compared.

  Args:
    actual_event: Event produced by the current run.
    recorded_event: Event taken from the recording.
    index: Position of the event; used only to label the error message.

  Returns:
    A successful ComparisonResult when the dumps are equal, otherwise a failed
    one whose error message is a diff of the two dumps.
  """
  # Per-part content fields that vary per run.
  part_fields = {
      "thought_signature": True,
      "function_call": {"id": True},
      "function_response": {"id": True},
  }
  # Action fields that vary per run.
  action_fields = {
      "state_delta": {
          "_adk_recordings_config": True,
          "_adk_replay_config": True,
      },
      "requested_auth_configs": True,
      "requested_tool_confirmations": True,
  }
  # Everything ignored on both sides, including event-level identifiers.
  per_run_fields = {
      "id": True,
      "timestamp": True,
      "invocation_id": True,
      "long_running_tool_ids": True,
      "content": {"parts": {"__all__": part_fields}},
      "actions": action_fields,
  }

  # Dump both events with identical options so only real differences remain.
  dump_options = {"exclude_none": True, "exclude": per_run_fields}
  actual_dump = actual_event.model_dump(**dump_options)
  recorded_dump = recorded_event.model_dump(**dump_options)

  if actual_dump == recorded_dump:
    return ComparisonResult(success=True)

  diff_text = _generate_diff_message(
      f"event {index}", actual_dump, recorded_dump
  )
  return ComparisonResult(success=False, error_message=diff_text)
121+
122+
123+
def compare_events(
    actual_events: list[Event], recorded_events: list[Event]
) -> ComparisonResult:
  """Check a list of events from the current run against the recorded list.

  Args:
    actual_events: Events produced by the current run.
    recorded_events: Events taken from the recording.

  Returns:
    A failed ComparisonResult when the list lengths differ or, failing that,
    the result of the first pair that does not match; a successful one when
    every pair matches.
  """
  actual_count = len(actual_events)
  recorded_count = len(recorded_events)
  if actual_count != recorded_count:
    count_message = _generate_mismatch_message(
        "Event count", str(actual_count), str(recorded_count)
    )
    return ComparisonResult(success=False, error_message=count_message)

  # Lazy generator: pairs are compared in order and comparison stops at the
  # first failure.
  outcomes = (
      compare_event(actual, recorded, position)
      for position, (actual, recorded) in enumerate(
          zip(actual_events, recorded_events)
      )
  )
  first_failure = next(
      (outcome for outcome in outcomes if not outcome.success), None
  )
  if first_failure is not None:
    return first_failure
  return ComparisonResult(success=True)
141+
142+
143+
def compare_session(
    actual_session: Session, recorded_session: Session
) -> ComparisonResult:
  """Check the session from the current run against the recorded session.

  Both sessions are dumped with ``model_dump`` (dropping ``None`` values and
  the fields listed below) and the resulting dicts are compared. Events are
  left out of this comparison.

  Args:
    actual_session: Session produced by the current run.
    recorded_session: Session taken from the recording.

  Returns:
    ComparisonResult with success status and optional error message
  """
  # State entries holding ADK internal configuration.
  internal_state_keys = {
      "_adk_recordings_config": True,
      "_adk_replay_config": True,
  }
  ignored_fields = {
      # Session-level fields that vary per run.
      "id": True,
      "last_update_time": True,
      "state": internal_state_keys,
      # Events comparison handled separately.
      "events": True,
  }

  # Dump both sessions with identical options so only real differences remain.
  dump_options = {"exclude_none": True, "exclude": ignored_fields}
  actual_dump = actual_session.model_dump(**dump_options)
  recorded_dump = recorded_session.model_dump(**dump_options)

  if actual_dump == recorded_dump:
    return ComparisonResult(success=True)

  diff_text = _generate_diff_message("session", actual_dump, recorded_dump)
  return ComparisonResult(success=False, error_message=diff_text)

0 commit comments

Comments
 (0)
0