Evaluator is the core component in the Blades framework for assessing the quality of AI model responses. It provides a structured way to judge the relevance and accuracy of model outputs.
An Evaluator returns an evaluation result together with structured feedback. Its interface is defined as follows:
```go
type Evaluator interface {
	Evaluate(context.Context, *blades.Prompt) (*Evaluation, error)
}
```

The Evaluation struct represents the evaluation result:
```go
type Evaluation struct {
	Pass     bool      `json:"pass" jsonschema:"Indicates whether the response satisfies the evaluation criteria."`
	Score    float64   `json:"score" jsonschema:"LLM-judged similarity to the expected response; score in [0,1], higher is better."`
	Feedback *Feedback `json:"feedback" jsonschema:"Structured feedback on the evaluation results."`
}
```
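Any type that satisfies the Evaluator interface can stand in for the built-in criteria evaluator. The following is a minimal, hypothetical sketch (the ThresholdEvaluator type is not part of Blades, and it assumes Evaluator and Evaluation are exported by the evaluate package; adjust the import path if they live elsewhere). It wraps an existing evaluator and re-derives Pass from a minimum score:

```go
package examples

import (
	"context"

	"github.com/go-kratos/blades"
	"github.com/go-kratos/blades/evaluate"
)

// ThresholdEvaluator is a hypothetical wrapper, not part of Blades: it delegates
// to an inner evaluator and tightens the pass/fail decision with a score threshold.
type ThresholdEvaluator struct {
	Inner    evaluate.Evaluator // e.g. an evaluator built with evaluate.NewCriteria
	MinScore float64            // scores below this threshold fail regardless of the inner verdict
}

func (t *ThresholdEvaluator) Evaluate(ctx context.Context, prompt *blades.Prompt) (*evaluate.Evaluation, error) {
	result, err := t.Inner.Evaluate(ctx, prompt)
	if err != nil {
		return nil, err
	}
	// Keep the inner score and feedback, but only pass when the score clears the threshold.
	result.Pass = result.Pass && result.Score >= t.MinScore
	return result, nil
}
```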
The Feedback struct represents the feedback attached to an evaluation result:

```go
type Feedback struct {
	Summary     string   `json:"summary" jsonschema:"Short summary of evaluation results."`
	Details     string   `json:"details" jsonschema:"Detailed explanation of strengths, weaknesses, and reasoning."`
	Suggestions []string `json:"suggestions" jsonschema:"List of recommended improvements or fixes."`
}
```

The snippet below creates a criteria-based evaluator with evaluate.NewCriteria, builds an evaluation prompt from a template (evaluationTmpl is defined in the full example further down), runs a single evaluation, and prints the result:

```go
evaluator, err := evaluate.NewCriteria(
	"Evaluation Agent",
	blades.WithModel("gpt-4"),
	blades.WithProvider(openai.NewChatProvider()),
)
if err != nil {
	log.Fatal(err)
}
prompt, err := blades.NewPromptTemplate().
	System(evaluationTmpl, map[string]any{
		"Input":  "What is the capital of France?",
		"Output": "Paris",
	}).
	Build()
if err != nil {
	log.Fatal(err)
}
result, err := evaluator.Evaluate(context.Background(), prompt)
if err != nil {
	log.Fatal(err)
}
fmt.Printf("Pass: %t Score: %f\n", result.Pass, result.Score)
fmt.Printf("Feedback: %+v\n", result.Feedback)
```

The complete, runnable example below evaluates several question-answer pairs in a loop:

```go
package examples

import (
	"context"
	"fmt"
	"log"

	"github.com/go-kratos/blades"
	"github.com/go-kratos/blades/contrib/openai"
	"github.com/go-kratos/blades/evaluate"
)

const evaluationTmpl = `You are a professional evaluation expert. Your task is to assess the relevance of AI model responses to a given prompt.

Please use the above guidelines to evaluate the AI model's response.

The input content is as follows:
{
	"User Prompt": {{ .Input }},
	"Model Response": {{ .Output }},
}
Respond in JSON format.`

func Evaluate_example() {
	// Test data (note: the second answer does not match its question, 5 km = 5000 m,
	// so it exercises a failing evaluation)
	qa := map[string]string{
		"what is the capital of france":  "Paris",
		"convert 5 kilometers to meters": "60 kilometers/hour.",
	}

	// Create the evaluator
	evaluator, err := evaluate.NewCriteria(
		"Evaluation Agent",
		blades.WithModel("qwen-plus"),
		blades.WithProvider(openai.NewChatProvider()),
	)
	if err != nil {
		log.Fatal(err)
	}

	// Evaluate the question-answer pairs one by one
	for question, answer := range qa {
		// Construct the evaluation prompt
		prompt, err := blades.NewPromptTemplate().
			System(evaluationTmpl, map[string]any{
				"Input":  question,
				"Output": answer,
			}).
			User("make an evaluation").
			Build()
		if err != nil {
			log.Fatal(err)
		}

		// Execute the evaluation
		result, err := evaluator.Evaluate(context.Background(), prompt)
		if err != nil {
			log.Fatal(err)
		}

		// Output the results
		fmt.Printf("Question: %s\n", question)
		fmt.Printf("Answer: %s\n", answer)
		fmt.Printf("Pass: %t Score: %.2f\n", result.Pass, result.Score)
		if result.Feedback != nil {
			fmt.Printf("Summary: %s\n", result.Feedback.Summary)
			fmt.Printf("Details: %s\n", result.Feedback.Details)
			fmt.Printf("Suggestions: %v\n", result.Feedback.Suggestions)
		}
		fmt.Println("---")
	}
}
```
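When evaluating a whole test set, it is often handy to reduce the per-item results to a single summary. The helper below is an illustrative sketch, not part of Blades (the summarize name is made up, and it assumes the Evaluation type shown above is exported by the evaluate package); it folds a slice of results into a pass rate and a mean score:

```go
package examples

import "github.com/go-kratos/blades/evaluate"

// summarize aggregates evaluation results into an overall pass rate and average score.
func summarize(results []*evaluate.Evaluation) (passRate, avgScore float64) {
	if len(results) == 0 {
		return 0, 0
	}
	var passed int
	var total float64
	for _, r := range results {
		if r.Pass {
			passed++
		}
		total += r.Score
	}
	return float64(passed) / float64(len(results)), total / float64(len(results))
}
```

Collecting the result values from the loop above into a slice and passing them to such a helper makes it easy to track how the pass rate changes across model or prompt revisions.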