Skip to content

fix: smoke test flakes #524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions pkg/tests/judge/judge.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ import (
openai "github.com/gptscript-ai/chat-completion-client"
)

const instructions = `When given JSON objects that conform to the following JSONSchema:
const instructions = `"actual" is considered equivalent to "expected" if and only if the following rules are satisfied:

%s

Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if all of the constraints described by "criteria" are satisfied.
When given JSON objects that conform to the following JSONSchema:

%s

Determine if "actual" is considered equivalent to "expected".

After making a determination, respond with a JSON object that conforms to the following JSONSchema:

Expand All @@ -28,7 +31,7 @@ After making a determination, respond with a JSON object that conforms to the fo
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
"description": "The reasoning used to come to the determination"
}
},
"required": [
Expand All @@ -41,14 +44,13 @@ Your responses are concise and include only the json object described above.
`

type Judge[T any] struct {
client *openai.Client
instructions string
client *openai.Client
comparisonSchema string
}

type comparison[T any] struct {
Expected T `json:"expected"`
Actual T `json:"actual"`
Criteria string `json:"criteria"`
Expected T `json:"expected"`
Actual T `json:"actual"`
}

type ruling struct {
Expand All @@ -70,22 +72,21 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
}

schemaJSON, err := json.MarshalIndent(schema, "", " ")
marshaled, err := json.MarshalIndent(schema, "", " ")
if err != nil {
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
}

return &Judge[T]{
client: client,
instructions: fmt.Sprintf(instructions, schemaJSON),
client: client,
comparisonSchema: string(marshaled),
}, nil
}

func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
Expected: expected,
Actual: actual,
Criteria: criteria,
}, "", " ")
if err != nil {
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
Expand All @@ -101,7 +102,7 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
Messages: []openai.ChatCompletionMessage{
{
Role: openai.ChatMessageRoleSystem,
Content: j.instructions,
Content: fmt.Sprintf(instructions, criteria, j.comparisonSchema),
},
{
Role: openai.ChatMessageRoleUser,
Expand All @@ -111,11 +112,11 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
}
response, err := j.client.CreateChatCompletion(ctx, request)
if err != nil {
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
return false, "", fmt.Errorf("failed to create chat completion request: %w", err)
}

if len(response.Choices) < 1 {
return false, "", fmt.Errorf("judge chat completion request returned no choices")
return false, "", fmt.Errorf("chat completion request returned no choices")
}

var equality ruling
Expand Down
9 changes: 7 additions & 2 deletions pkg/tests/smoke/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,13 @@ func TestSmoke(t *testing.T) {
ctx,
expectedEvents,
actualEvents,
`The field values of the elements of expected and actual must be roughly equivalent.
Ignore variations in timestamps, IDs, and verbiage when determining equivalence.`,
`
- disregard differences in timestamps, generated IDs, natural language verbiage, and event order
- omit callProgress events from the comparison
- the overall stream of events and set of tools called should roughly match
- arguments passed in tool calls should be roughly the same
- the final callFinish event should be semantically similar
`,
)
require.NoError(t, err, "error getting judge ruling on output")
require.True(t, equal, reasoning)
Expand Down
Loading