23
23
from typing import Any
24
24
from typing import Dict
25
25
from typing import List
26
+ from typing import Optional
26
27
from typing import Tuple
27
28
28
29
import click
29
30
from click .testing import CliRunner
30
31
from google .adk .cli import cli_tools_click
31
32
from google .adk .evaluation import local_eval_set_results_manager
33
+ from google .adk .sessions import Session
34
+ from pydantic import BaseModel
32
35
import pytest
33
36
34
37
35
38
# Helpers
36
class _Recorder(BaseModel):
    """Callable test double that remembers the arguments of every call."""

    # One `(args, kwargs)` pair per invocation, appended in call order.
    calls: List[Tuple[Tuple[Any, ...], Dict[str, Any]]] = []

    def __call__(self, *args: Any, **kwargs: Any) -> None:  # noqa: D401
        """Record the positional and keyword arguments of this call."""
        invocation = (args, kwargs)
        self.calls.append(invocation)
@@ -254,30 +256,23 @@ class _EvalMetric:
254
256
def __init__ (self , metric_name : str , threshold : float ) -> None :
255
257
...
256
258
257
class _EvalCaseResult(BaseModel):
    """Pydantic stand-in for an eval case result yielded by the stubbed `run_evals`.

    The first five fields are required; the session details and the three
    metric-result collections are optional and default to "nothing recorded".
    """

    eval_set_id: str
    eval_id: str
    # Typed as Any so callers can pass whatever status object they use.
    final_eval_status: Any
    user_id: str
    session_id: str
    session_details: Optional[Session] = None
    # These fields are annotated as `list`, so their defaults must be empty
    # lists. A `{}` default would be a dict, and pydantic does not validate
    # default values unless asked to, so the mismatch would go unnoticed.
    eval_metric_results: list = []
    overall_eval_metric_results: list = []
    eval_metric_result_per_invocation: list = []
270
class EvalCase(BaseModel):
    """Minimal eval case model carrying only its identifier."""

    eval_id: str
275
-
276
class EvalSet(BaseModel):
    """Minimal eval set model: an identifier plus the eval cases it contains."""

    eval_set_id: str
    eval_cases: list[EvalCase]
281
276
282
277
def mock_save_eval_set_result(cls, *args, **kwargs):
    """Accept any arguments, do nothing, and return None."""
    return
@@ -302,13 +297,38 @@ def mock_save_eval_set_result(cls, *args, **kwargs):
302
297
# The stubbed reset-func lookup always returns None.
stub.try_get_reset_func = lambda _p: None
# Every call reports one eval set file containing the two eval ids.
stub.parse_and_get_evals_to_run = lambda _paths: {"set1.json": ["e1", "e2"]}
# Loading an eval set ignores both arguments and returns a fixed two-case set.
eval_sets_manager_stub.load_eval_set_from_file = lambda x, y: EvalSet(
    eval_set_id="test_eval_set_id",
    eval_cases=[EvalCase(eval_id=case_id) for case_id in ("e1", "e2")],
)
307
303
308
304
# Create an async generator function for run_evals
async def mock_run_evals(*_a, **_k):
    """Yield one passing and one failing eval case result, ignoring all arguments."""
    # (eval_id, session_id, score, status) for each result, in yield order.
    outcomes = (
        ("e1", "session1", 1.0, _EvalStatus.PASSED),
        ("e2", "session2", 0.0, _EvalStatus.FAILED),
    )
    for case_id, session, score, status in outcomes:
        yield _EvalCaseResult(
            eval_set_id="set1.json",
            eval_id=case_id,
            final_eval_status=status,
            user_id="user",
            session_id=session,
            overall_eval_metric_results=[{
                "metricName": "some_metric",
                "threshold": 0.0,
                "score": score,
                "evalStatus": status,
            }],
        )


stub.run_evals = mock_run_evals
314
334
@@ -324,9 +344,11 @@ def mock_asyncio_run(coro):
324
344
monkeypatch.setattr(cli_tools_click.asyncio, "run", mock_asyncio_run)

# inject stub
# Registering through monkeypatch.setitem lets pytest undo the sys.modules
# entries when the test finishes.
stubbed_modules = (
    ("google.adk.cli.cli_eval", stub),
    ("google.adk.evaluation.local_eval_sets_manager", eval_sets_manager_stub),
)
for module_name, module_stub in stubbed_modules:
    monkeypatch.setitem(sys.modules, module_name, module_stub)
331
353
332
354
# create dummy agent directory
0 commit comments