|
22 | 22 | from pathlib import Path
|
23 | 23 | import re
|
24 | 24 | import sys
|
| 25 | +import time |
25 | 26 | import traceback
|
26 | 27 | import typing
|
27 | 28 | from typing import Any
|
28 | 29 | from typing import List
|
29 | 30 | from typing import Literal
|
30 | 31 | from typing import Optional
|
31 | | -from typing import Union |
32 | 32 |
|
33 | 33 | import click
|
34 | 34 | from fastapi import FastAPI
|
|
71 | 71 | from ..sessions.vertex_ai_session_service import VertexAiSessionService
|
72 | 72 | from ..tools.base_toolset import BaseToolset
|
73 | 73 | from .cli_eval import EVAL_SESSION_ID_PREFIX
|
| 74 | +from .cli_eval import EvalCaseResult |
74 | 75 | from .cli_eval import EvalMetric
|
75 | 76 | from .cli_eval import EvalMetricResult
|
| 77 | +from .cli_eval import EvalSetResult |
76 | 78 | from .cli_eval import EvalStatus
|
77 | 79 | from .utils import create_empty_state
|
78 | 80 | from .utils import envs
|
|
81 | 83 | logger = logging.getLogger(__name__)
|
82 | 84 |
|
83 | 85 | _EVAL_SET_FILE_EXTENSION = ".evalset.json"
|
| 86 | +_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json" |
84 | 87 |
|
85 | 88 |
|
86 | 89 | class ApiServerSpanExporter(export.SpanExporter):
|
@@ -137,10 +140,12 @@ class RunEvalResult(BaseModel):
|
137 | 140 | populate_by_name=True,
|
138 | 141 | )
|
139 | 142 |
|
| 143 | + eval_set_file: str |
140 | 144 | eval_set_id: str
|
141 | 145 | eval_id: str
|
142 | 146 | final_eval_status: EvalStatus
|
143 | 147 | eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]]
|
| 148 | + user_id: str |
144 | 149 | session_id: str
|
145 | 150 |
|
146 | 151 |
|
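With the two new fields in place, each entry returned by the `run_eval` endpoint identifies the source eval set file and the evaluated user in addition to the session. A minimal sketch of the per-case payload a client might receive; every value below is illustrative, and the exact wire format of `EvalStatus` and the metric pairs depends on how those types serialize:

```python
# Illustrative only: the shape of one RunEvalResult entry in the run_eval
# response after this change. All values here are hypothetical.
example_run_eval_result = {
    "app_name": "my_agent",
    "eval_set_file": "samples.evalset.json",  # new field
    "eval_set_id": "samples",
    "eval_id": "case_1",
    "final_eval_status": 1,  # EvalStatus member; serialization assumed
    "eval_metric_results": [],  # (EvalMetric, EvalMetricResult) pairs
    "user_id": "test_user",  # new field
    "session_id": "session_123",
}
```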
@@ -484,24 +489,117 @@ async def run_eval(
|
484 | 489 | "Eval ids to run list is empty. We will run all evals in the eval set."
|
485 | 490 | )
|
486 | 491 | root_agent = await _get_root_agent_async(app_name)
|
487 | | -    return [ |
488 | | -        RunEvalResult( |
489 | | -            app_name=app_name, |
490 | | -            eval_set_id=eval_set_id, |
491 | | -            eval_id=eval_result.eval_id, |
492 | | -            final_eval_status=eval_result.final_eval_status, |
493 | | -            eval_metric_results=eval_result.eval_metric_results, |
494 | | -            session_id=eval_result.session_id, |
495 | | -        ) |
496 | | -        async for eval_result in run_evals( |
497 | | -            eval_set_to_evals, |
498 | | -            root_agent, |
499 | | -            getattr(root_agent, "reset_data", None), |
500 | | -            req.eval_metrics, |
501 | | -            session_service=session_service, |
502 | | -            artifact_service=artifact_service, |
| 492 | + run_eval_results = [] |
| 493 | + eval_case_results = [] |
| 494 | + async for eval_result in run_evals( |
| 495 | + eval_set_to_evals, |
| 496 | + root_agent, |
| 497 | + getattr(root_agent, "reset_data", None), |
| 498 | + req.eval_metrics, |
| 499 | + session_service=session_service, |
| 500 | + artifact_service=artifact_service, |
| 501 | + ): |
| 502 | + run_eval_results.append( |
| 503 | + RunEvalResult( |
| 504 | + app_name=app_name, |
| 505 | + eval_set_file=eval_result.eval_set_file, |
| 506 | + eval_set_id=eval_set_id, |
| 507 | + eval_id=eval_result.eval_id, |
| 508 | + final_eval_status=eval_result.final_eval_status, |
| 509 | + eval_metric_results=eval_result.eval_metric_results, |
| 510 | + user_id=eval_result.user_id, |
| 511 | + session_id=eval_result.session_id, |
| 512 | + ) |
| 513 | + ) |
| 514 | + session = session_service.get_session( |
| 515 | + app_name=app_name, |
| 516 | + user_id=eval_result.user_id, |
| 517 | + session_id=eval_result.session_id, |
| 518 | + ) |
| 519 | + eval_case_results.append( |
| 520 | + EvalCaseResult( |
| 521 | + eval_set_file=eval_result.eval_set_file, |
| 522 | + eval_id=eval_result.eval_id, |
| 523 | + final_eval_status=eval_result.final_eval_status, |
| 524 | + eval_metric_results=eval_result.eval_metric_results, |
| 525 | + session_id=eval_result.session_id, |
| 526 | + session_details=session, |
| 527 | + user_id=eval_result.user_id, |
| 528 | + ) |
| 529 | + ) |
| 530 | + |
| 531 | + timestamp = time.time() |
| 532 | + eval_set_result_name = app_name + "_" + eval_set_id + "_" + str(timestamp) |
| 533 | + eval_set_result = EvalSetResult( |
| 534 | + eval_set_result_id=eval_set_result_name, |
| 535 | + eval_set_result_name=eval_set_result_name, |
| 536 | + eval_set_id=eval_set_id, |
| 537 | + eval_case_results=eval_case_results, |
| 538 | + creation_timestamp=timestamp, |
| 539 | + ) |
| 540 | + |
| 541 | +    # Write the eval set result to a file named after eval_set_result_name. |
| 542 | + app_eval_history_dir = os.path.join( |
| 543 | + agent_dir, app_name, ".adk", "eval_history" |
| 544 | + ) |
| 545 | + if not os.path.exists(app_eval_history_dir): |
| 546 | + os.makedirs(app_eval_history_dir) |
| 547 | + # Convert to json and write to file. |
| 548 | + eval_set_result_json = eval_set_result.model_dump_json() |
| 549 | + eval_set_result_file_path = os.path.join( |
| 550 | + app_eval_history_dir, |
| 551 | + eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION, |
| 552 | + ) |
| 553 | + logger.info("Writing eval result to file: %s", eval_set_result_file_path) |
| 554 | + with open(eval_set_result_file_path, "w") as f: |
| 555 | +      f.write(eval_set_result_json)  # model_dump_json() already returned a JSON string. |
| 556 | + |
| 557 | + return run_eval_results |
| 558 | + |
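The persistence step above writes one `EvalSetResult` per `run_eval` call to `<agent_dir>/<app_name>/.adk/eval_history/<app_name>_<eval_set_id>_<timestamp>.evalset_result.json`. A sketch of reading such a file back, assuming the `EvalSetResult` import path below and hypothetical directory and id values:

```python
# Sketch: load a persisted eval set result, mirroring the write path above.
# The import path, agent_dir, and result id are assumptions for illustration.
import os

from google.adk.cli.cli_eval import EvalSetResult  # assumed import path

agent_dir = "./agents"  # hypothetical
app_name = "my_agent"  # hypothetical
result_id = "my_agent_samples_1714000000.0"  # app_name + "_" + eval_set_id + "_" + str(timestamp)

path = os.path.join(
    agent_dir,
    app_name,
    ".adk",
    "eval_history",
    result_id + ".evalset_result.json",
)
with open(path) as f:
  result = EvalSetResult.model_validate_json(f.read())
print(result.eval_set_id, len(result.eval_case_results))
```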
| 559 | + @app.get( |
| 560 | + "/apps/{app_name}/eval_results/{eval_result_id}", |
| 561 | + response_model_exclude_none=True, |
| 562 | + ) |
| 563 | + def get_eval_result( |
| 564 | + app_name: str, |
| 565 | + eval_result_id: str, |
| 566 | + ) -> EvalSetResult: |
| 567 | + """Gets the eval result for the given eval id.""" |
| 568 | + # Load the eval set file data |
| 569 | + maybe_eval_result_file_path = ( |
| 570 | + os.path.join( |
| 571 | + agent_dir, app_name, ".adk", "eval_history", eval_result_id |
503 | 572 | )
|
| 573 | + + _EVAL_SET_RESULT_FILE_EXTENSION |
| 574 | + ) |
| 575 | + if not os.path.exists(maybe_eval_result_file_path): |
| 576 | + raise HTTPException( |
| 577 | + status_code=404, |
| 578 | + detail=f"Eval result `{eval_result_id}` not found.", |
| 579 | + ) |
| 580 | + with open(maybe_eval_result_file_path, "r") as file: |
| 581 | +      eval_result_data = file.read()  # Raw JSON string for model_validate_json. |
| 582 | + try: |
| 583 | + eval_result = EvalSetResult.model_validate_json(eval_result_data) |
| 584 | + return eval_result |
| 585 | + except ValidationError as e: |
| 586 | +      logger.exception("get_eval_result validation error: %s", e) |
| 587 | +      # Re-raise as an HTTP error instead of implicitly returning None. |
| 588 | +      raise HTTPException(status_code=500, detail="Eval result file is corrupted.") from e |
| 589 | + |
| 588 | + @app.get( |
| 589 | + "/apps/{app_name}/eval_results", |
| 590 | + response_model_exclude_none=True, |
| 591 | + ) |
| 592 | + def list_eval_results(app_name: str) -> list[str]: |
| 593 | + """Lists all eval results for the given app.""" |
| 594 | + app_eval_history_directory = os.path.join( |
| 595 | + agent_dir, app_name, ".adk", "eval_history" |
| 596 | +    ) |
| 597 | +    # Return an empty list if no evals have been run for this app yet. |
| 598 | +    if not os.path.exists(app_eval_history_directory): |
| 599 | +      return [] |
| 600 | +    eval_result_files = [ |
| 601 | +        file.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION) |
| 602 | +        for file in os.listdir(app_eval_history_directory) |
| 603 | +        if file.endswith(_EVAL_SET_RESULT_FILE_EXTENSION) |
504 | 601 | ]
|
| 602 | + return eval_result_files |
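Taken together, the two GET routes let a client enumerate stored results and fetch any one of them. A sketch of exercising them against a locally running server; the host, port, and app name are assumptions:

```python
# Sketch: call the new eval_results endpoints. Host/port and app name are
# assumptions; requires the `requests` package and a running server.
import requests

base_url = "http://127.0.0.1:8000"  # hypothetical dev server address
app_name = "my_agent"  # hypothetical

# List ids of all stored eval set results for the app.
result_ids = requests.get(f"{base_url}/apps/{app_name}/eval_results").json()

# Fetch the full EvalSetResult for the first id, if any exist.
if result_ids:
  resp = requests.get(f"{base_url}/apps/{app_name}/eval_results/{result_ids[0]}")
  resp.raise_for_status()
  result = resp.json()
  print(result["eval_set_id"], len(result["eval_case_results"]))
```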
505 | 603 |
|
506 | 604 | @app.delete("/apps/{app_name}/users/{user_id}/sessions/{session_id}")
|
507 | 605 | def delete_session(app_name: str, user_id: str, session_id: str):
|