 from enum import Enum
 
 import requests
-from azure.ai.evaluation import ContentSafetyEvaluator
+from azure.ai.evaluation import AzureAIProject, ContentSafetyEvaluator
 from azure.ai.evaluation.simulator import (
     AdversarialScenario,
     AdversarialSimulator,
@@ -47,7 +47,7 @@ def get_azure_credential():
 
 
 async def callback(
-    messages: list[dict],
+    messages: dict,
     target_url: str = "http://127.0.0.1:8000/chat",
 ):
     messages_list = messages["messages"]
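
A note on the signature change: the simulator hands the callback the whole conversation as a single dict (hence messages: dict), with the actual turn list under the "messages" key, as the last line of this hunk shows. For orientation, a minimal self-contained sketch of such a callback follows; the request payload ({"message": ...}), the response key, and the exact shape of the returned dict are assumptions for illustration, not taken from this diff.

import requests


async def example_callback(messages: dict, target_url: str = "http://127.0.0.1:8000/chat") -> dict:
    # Illustrative sketch only: forward the latest simulated user turn to the app under test.
    messages_list = messages["messages"]
    query = messages_list[-1]["content"]
    response = requests.post(target_url, json={"message": query}, timeout=30)  # request shape assumed
    response.raise_for_status()
    answer = response.json().get("message", "")  # response shape assumed
    # Append the app's reply so the simulator records a complete query/response pair.
    messages_list.append({"content": answer, "role": "assistant"})
    return {"messages": messages_list, "stream": False, "session_state": None, "context": None}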
@@ -70,10 +70,10 @@ async def callback(
 
 async def run_simulator(target_url: str, max_simulations: int):
     credential = get_azure_credential()
-    azure_ai_project = {
-        "subscription_id": os.getenv("AZURE_SUBSCRIPTION_ID"),
-        "resource_group_name": os.getenv("AZURE_RESOURCE_GROUP"),
-        "project_name": os.getenv("AZURE_AI_PROJECT"),
+    azure_ai_project: AzureAIProject = {
+        "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"],
+        "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"],
+        "project_name": os.environ["AZURE_AI_PROJECT"],
     }
 
     # Simulate single-turn question-and-answering against the app
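
Two small but deliberate changes in this hunk: the AzureAIProject annotation lets a type checker validate the dict's keys against the SDK's expected shape, and switching from os.getenv(...) to os.environ[...] makes missing configuration fail fast instead of letting None values flow into the project settings. A tiny illustration of the latter (variable name chosen just for the example):

import os

# os.getenv returns None silently when the variable is unset...
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")  # may be None
# ...while indexing os.environ raises KeyError right away, surfacing
# misconfiguration before any simulation or evaluation work starts.
subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]  # KeyError if unset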
@@ -91,7 +91,9 @@ async def run_simulator(target_url: str, max_simulations: int):
     # Run safety evaluation on the outputs and save the scores
     # Do not save the outputs, as they may contain disturbing content
     evaluators = ["hate_unfairness", "sexual", "violence", "self_harm"]
-    summary_scores = {evaluator: {"low_count": 0, "score_total": 0} for evaluator in evaluators}
+    summary_scores = {
+        evaluator: {"low_count": 0, "score_total": 0.0, "mean_score": 0.0, "low_rate": 0.0} for evaluator in evaluators
+    }
     for output in track(outputs, description="Evaluating simulated responses..."):
         query = output["messages"][0]["content"]
         answer = output["messages"][1]["content"]
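
The reworked comprehension pre-populates mean_score and low_rate (and makes score_total a float), so every evaluator starts with a complete, consistently typed record; the final hunk below relies on this to drop its else branch. Expanded, the initial structure looks like this (shown for one evaluator):

# Illustrative expansion of the comprehension's result.
summary_scores = {
    "hate_unfairness": {"low_count": 0, "score_total": 0.0, "mean_score": 0.0, "low_rate": 0.0},
    # ...identical entries for "sexual", "violence", and "self_harm"
}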
@@ -103,7 +105,9 @@ async def run_simulator(target_url: str, max_simulations: int):
                 summary_scores[evaluator]["low_count"] += 1
             else:
                 logger.info(f"Failing score from:\nQ: {query}\nA: {answer}\n{evaluator} score: {eval_score}")
-            summary_scores[evaluator]["score_total"] += eval_score[f"{evaluator}_score"]
+            numeric_severity_score = eval_score[f"{evaluator}_score"]
+            if isinstance(numeric_severity_score, float):
+                summary_scores[evaluator]["score_total"] += numeric_severity_score
 
     # Compute the overall statistics
     for evaluator in evaluators:
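
The new guard accounts for the per-metric numeric score (eval_score[f"{evaluator}_score"]) not always being a float; if an evaluation fails to produce a numeric severity, adding the raw value would raise a TypeError or skew the total. A standalone sketch of the same pattern, with hypothetical evaluator outputs:

# Only well-formed numeric severity scores contribute to the running total.
score_total = 0.0
for numeric_severity_score in [1.0, 4.0, None, "n/a"]:  # hypothetical outputs
    if isinstance(numeric_severity_score, float):
        score_total += numeric_severity_score
print(score_total)  # 5.0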
@@ -112,9 +116,6 @@ async def run_simulator(target_url: str, max_simulations: int):
                 summary_scores[evaluator]["score_total"] / summary_scores[evaluator]["low_count"]
             )
             summary_scores[evaluator]["low_rate"] = summary_scores[evaluator]["low_count"] / len(outputs)
-        else:
-            summary_scores[evaluator]["mean_score"] = 0
-            summary_scores[evaluator]["low_rate"] = 0
 
     # Save summary scores
     with open(root_dir / "safety_results.json", "w") as f:
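
Because the defaults are now set up front, the removed else branch was redundant: when an evaluator has no low-severity results, its mean_score and low_rate simply remain 0.0. The summary written to safety_results.json therefore always carries all four fields per evaluator; an illustrative example with made-up numbers:

# Illustrative only (numbers invented): shape of the persisted summary.
example_summary = {
    "hate_unfairness": {"low_count": 18, "score_total": 9.0, "mean_score": 0.5, "low_rate": 0.9},
    "sexual": {"low_count": 0, "score_total": 0.0, "mean_score": 0.0, "low_rate": 0.0},
    # ..."violence" and "self_harm" follow the same shape
}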