Skip to content

Commit 8a283fd

Browse files
authored
Merge pull request #524 from njhale/fix/smoke-flakes
fix: smoke test flakes
2 parents c25c7be + 1500872 commit 8a283fd

12 files changed

+2021
-3264
lines changed

pkg/tests/judge/judge.go

+17 -16
Original file line number | Diff line number | Diff line change
@@ -9,12 +9,15 @@ import (
99
openai "github.com/gptscript-ai/chat-completion-client"
1010
)
1111

12-
const instructions = `When given JSON objects that conform to the following JSONSchema:
12+
const instructions = `"actual" is considered equivalent to "expected" if and only if the following rules are satisfied:
1313
1414
%s
1515
16-
Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
17-
"actual" is considered equal to "expected" if and only if the all of the constraints described by "criteria" are satisfied.
16+
When given JSON objects that conform to the following JSONSchema:
17+
18+
%s
19+
20+
Determine if "actual" is considered equivalent to "expected".
1821
1922
After making a determination, respond with a JSON object that conforms to the following JSONSchema:
2023
@@ -28,7 +31,7 @@ After making a determination, respond with a JSON object that conforms to the fo
2831
},
2932
"reasoning": {
3033
"type": "string",
31-
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
34+
"description": "The reasoning used to come to the determination"
3235
}
3336
},
3437
"required": [
@@ -41,14 +44,13 @@ Your responses are concise and include only the json object described above.
4144
`
4245

4346
type Judge[T any] struct {
44-
client *openai.Client
45-
instructions string
47+
client *openai.Client
48+
comparisonSchema string
4649
}
4750

4851
type comparison[T any] struct {
49-
Expected T `json:"expected"`
50-
Actual T `json:"actual"`
51-
Criteria string `json:"criteria"`
52+
Expected T `json:"expected"`
53+
Actual T `json:"actual"`
5254
}
5355

5456
type ruling struct {
@@ -70,22 +72,21 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
7072
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
7173
}
7274

73-
schemaJSON, err := json.MarshalIndent(schema, "", " ")
75+
marshaled, err := json.MarshalIndent(schema, "", " ")
7476
if err != nil {
7577
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
7678
}
7779

7880
return &Judge[T]{
79-
client: client,
80-
instructions: fmt.Sprintf(instructions, schemaJSON),
81+
client: client,
82+
comparisonSchema: string(marshaled),
8183
}, nil
8284
}
8385

8486
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
8587
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
8688
Expected: expected,
8789
Actual: actual,
88-
Criteria: criteria,
8990
}, "", " ")
9091
if err != nil {
9192
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
@@ -101,7 +102,7 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
101102
Messages: []openai.ChatCompletionMessage{
102103
{
103104
Role: openai.ChatMessageRoleSystem,
104-
Content: j.instructions,
105+
Content: fmt.Sprintf(instructions, criteria, j.comparisonSchema),
105106
},
106107
{
107108
Role: openai.ChatMessageRoleUser,
@@ -111,11 +112,11 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
111112
}
112113
response, err := j.client.CreateChatCompletion(ctx, request)
113114
if err != nil {
114-
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
115+
return false, "", fmt.Errorf("failed to create chat completion request: %w", err)
115116
}
116117

117118
if len(response.Choices) < 1 {
118-
return false, "", fmt.Errorf("judge chat completion request returned no choices")
119+
return false, "", fmt.Errorf("chat completion request returned no choices")
119120
}
120121

121122
var equality ruling

pkg/tests/smoke/smoke_test.go

+7 -2
Original file line number | Diff line number | Diff line change
@@ -81,8 +81,13 @@ func TestSmoke(t *testing.T) {
8181
ctx,
8282
expectedEvents,
8383
actualEvents,
84-
`The field values of the elements of expected and actual must be roughly equivalent.
85-
Ignore variations in timestamps, IDs, and verbiage when determining equivalence.`,
84+
`
85+
- disregard differences in timestamps, generated IDs, natural language verbiage, and event order
86+
- omit callProgress events from the comparision
87+
- the overall stream of events and set of tools called should roughly match
88+
- arguments passed in tool calls should be roughly the same
89+
- the final callFinish event should be semantically similar
90+
`,
8691
)
8792
require.NoError(t, err, "error getting judge ruling on output")
8893
require.True(t, equal, reasoning)

0 commit comments

Comments
 (0)