Adds the format.py scorer, which supports four validation modes:

- json: validates that the output is parseable JSON
- markdown: checks for headers (0.5) and lists (0.5)
- length: proportional scoring against min/max token bounds
- structure: JSON schema validation via the jsonschema library

Includes 38 passing tests covering all format types, edge cases, and async delegation.
228 lines
8 KiB
Python
228 lines
8 KiB
Python
"""Tests for the FormatScorer."""
|
|
|
|
import asyncio
|
|
import json
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from engine.scorers.format import FormatScorer
|
|
|
|
|
|
class TestFormatScorerInit:
    """Construction-time checks for FormatScorer."""

    def test_valid_format_types(self):
        """Each of the four format types is accepted and stored on the instance."""
        accepted = ("json", "markdown", "length", "structure")
        for format_name in accepted:
            assert FormatScorer(format_type=format_name).format_type == format_name

    def test_invalid_format_type_raises(self):
        """An unrecognised format_type is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Invalid format_type"):
            FormatScorer(format_type="xml")

    def test_name_property(self):
        """A default-constructed scorer exposes the name "format"."""
        assert FormatScorer().name == "format"

    def test_is_base_scorer(self):
        """A FormatScorer instance is also a BaseScorer instance."""
        from engine.scorers.base import BaseScorer

        assert isinstance(FormatScorer(), BaseScorer)
|
|
|
|
|
|
class TestJsonFormat:
    """JSON mode: the tests expect 1.0 for parseable JSON and 0.0 otherwise."""

    def test_valid_json_object(self):
        """A JSON object scores 1.0."""
        assert FormatScorer(format_type="json").score(None, '{"key": "value"}', {}) == 1.0

    def test_valid_json_array(self):
        """A JSON array scores 1.0."""
        assert FormatScorer(format_type="json").score(None, '[1, 2, 3]', {}) == 1.0

    def test_valid_json_string(self):
        """A bare JSON string scores 1.0."""
        assert FormatScorer(format_type="json").score(None, '"hello"', {}) == 1.0

    def test_valid_json_number(self):
        """A bare JSON number scores 1.0."""
        assert FormatScorer(format_type="json").score(None, '42', {}) == 1.0

    def test_valid_json_with_whitespace(self):
        """Surrounding whitespace does not stop valid JSON from scoring 1.0."""
        assert FormatScorer(format_type="json").score(None, ' {"key": "value"} ', {}) == 1.0

    def test_invalid_json(self):
        """Plain prose scores 0.0."""
        assert FormatScorer(format_type="json").score(None, "not json at all", {}) == 0.0

    def test_empty_string(self):
        """The empty string scores 0.0."""
        assert FormatScorer(format_type="json").score(None, "", {}) == 0.0

    def test_partial_json(self):
        """A truncated JSON document scores 0.0."""
        assert FormatScorer(format_type="json").score(None, '{"key":', {}) == 0.0
|
|
|
|
|
|
class TestMarkdownFormat:
    """Markdown mode: the tests expect 0.5 for a header plus 0.5 for a list."""

    @staticmethod
    def _markdown_score(text):
        """Score *text* with a freshly built markdown-mode FormatScorer."""
        return FormatScorer(format_type="markdown").score(None, text, {})

    def test_headers_only(self):
        """A header without any list scores 0.5."""
        assert self._markdown_score("# Title\n\nSome text here.") == 0.5

    def test_lists_only_unordered(self):
        """A dash-bulleted list without a header scores 0.5."""
        assert self._markdown_score("Some text\n- item one\n- item two") == 0.5

    def test_lists_only_ordered(self):
        """A "1." style numbered list without a header scores 0.5."""
        assert self._markdown_score("Some text\n1. first\n2. second") == 0.5

    def test_both_headers_and_lists(self):
        """A header together with a list scores the full 1.0."""
        assert self._markdown_score("# Title\n\n- item one\n- item two") == 1.0

    def test_no_markdown(self):
        """Text with neither a header nor a list scores 0.0."""
        assert self._markdown_score("Just plain text without any formatting.") == 0.0

    def test_nested_header_levels(self):
        """A second-level ("##") header counts as a header."""
        assert self._markdown_score("## Subtitle\n\nContent here") == 0.5

    def test_asterisk_list(self):
        """An asterisk-bulleted list counts as a list."""
        assert self._markdown_score("Some text\n* item one\n* item two") == 0.5

    def test_ordered_list_with_parenthesis(self):
        """A "1)" style numbered list counts as a list."""
        assert self._markdown_score("Text\n1) first\n2) second") == 0.5
|
|
|
|
|
|
class TestLengthFormat:
    """Length mode: scoring against optional min_tokens / max_tokens bounds."""

    def test_within_range(self):
        """A ten-word output inside [5, 20] scores 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=5, max_tokens=20)
        output = "this is a ten word sentence for the test case"
        assert scorer.score(None, output, {}) == 1.0

    def test_exact_min(self):
        """An output whose length equals min_tokens scores 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=3, max_tokens=10)
        assert scorer.score(None, "one two three", {}) == 1.0

    def test_exact_max(self):
        """An output whose length equals max_tokens scores 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=1, max_tokens=3)
        assert scorer.score(None, "one two three", {}) == 1.0

    def test_below_min(self):
        """Five words against min_tokens=10 earns the proportional score 0.5."""
        scorer = FormatScorer(format_type="length", min_tokens=10, max_tokens=20)
        output = "only five words here now"
        result = scorer.score(None, output, {})
        assert 0.0 < result < 1.0
        # The score is a computed ratio, so compare with a tolerance rather
        # than relying on exact floating-point equality.
        assert result == pytest.approx(5 / 10)  # 0.5

    def test_above_max(self):
        """Ten words against max_tokens=5 scores strictly below 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=1, max_tokens=5)
        output = "one two three four five six seven eight nine ten"
        result = scorer.score(None, output, {})
        assert 0.0 <= result < 1.0

    def test_no_bounds(self):
        """With neither bound configured, any output scores 1.0."""
        scorer = FormatScorer(format_type="length")
        assert scorer.score(None, "any text", {}) == 1.0

    def test_only_min(self):
        """With only min_tokens set, an output longer than the minimum scores 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=3)
        assert scorer.score(None, "one two three four", {}) == 1.0

    def test_only_max(self):
        """With only max_tokens set, an output shorter than the maximum scores 1.0."""
        scorer = FormatScorer(format_type="length", max_tokens=5)
        assert scorer.score(None, "one two", {}) == 1.0

    def test_empty_output(self):
        """An empty output does not earn the full score when min_tokens=5."""
        scorer = FormatScorer(format_type="length", min_tokens=5)
        # NOTE(review): how many tokens the scorer counts for "" depends on
        # its tokenisation, which is not visible from this test ("".split()
        # yields [] while "".split(" ") yields ['']) -- confirm against the
        # scorer. Either count is below min_tokens, so only "< 1.0" is pinned.
        result = scorer.score(None, "", {})
        assert result < 1.0

    def test_zero_min(self):
        """min_tokens=0 is accepted and a one-word output scores 1.0."""
        scorer = FormatScorer(format_type="length", min_tokens=0, max_tokens=10)
        assert scorer.score(None, "hello", {}) == 1.0
|
|
|
|
|
|
class TestStructureFormat:
    """Structure mode: output is scored against a configured JSON schema."""

    @staticmethod
    def _person_schema():
        """Return a fresh object schema requiring a string name and integer age."""
        return {
            "type": "object",
            "required": ["name", "age"],
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"},
            },
        }

    def test_valid_structure(self):
        """An object carrying every required field scores 1.0."""
        checker = FormatScorer(format_type="structure", json_schema=self._person_schema())
        payload = json.dumps({"name": "Alice", "age": 30})
        assert checker.score(None, payload, {}) == 1.0

    def test_missing_required_field(self):
        """An object lacking a required field scores 0.0."""
        checker = FormatScorer(format_type="structure", json_schema=self._person_schema())
        payload = json.dumps({"name": "Alice"})
        assert checker.score(None, payload, {}) == 0.0

    def test_wrong_type(self):
        """An object scored against an array schema scores 0.0."""
        checker = FormatScorer(format_type="structure", json_schema={"type": "array"})
        payload = json.dumps({"key": "value"})
        assert checker.score(None, payload, {}) == 0.0

    def test_valid_array_structure(self):
        """An array scored against an array schema scores 1.0."""
        checker = FormatScorer(format_type="structure", json_schema={"type": "array"})
        payload = json.dumps([1, 2, 3])
        assert checker.score(None, payload, {}) == 1.0

    def test_no_schema_returns_zero(self):
        """Without a json_schema, even valid JSON scores 0.0."""
        checker = FormatScorer(format_type="structure")
        assert checker.score(None, '{"key": "value"}', {}) == 0.0

    def test_invalid_json_for_structure(self):
        """Output that is not JSON scores 0.0 even when a schema is set."""
        checker = FormatScorer(format_type="structure", json_schema={"type": "object"})
        assert checker.score(None, "not json", {}) == 0.0

    def test_complex_schema(self):
        """A nested schema (object holding an array of objects) can score 1.0."""
        nested_schema = {
            "type": "object",
            "required": ["results"],
            "properties": {
                "results": {
                    "type": "array",
                    "items": {"type": "object"},
                },
            },
        }
        checker = FormatScorer(format_type="structure", json_schema=nested_schema)
        payload = json.dumps({"results": [{"id": 1}, {"id": 2}]})
        assert checker.score(None, payload, {}) == 1.0
|
|
|
|
|
|
class TestAsyncScoring:
    """Checks for the awaitable scoring entry point, score_async."""

    def test_async_delegates_to_sync(self):
        """Awaiting score_async on valid JSON yields 1.0 in JSON mode."""
        scorer = FormatScorer(format_type="json")

        async def _await_score():
            # Wrapping the call in a coroutine lets asyncio.run drive it
            # whatever kind of awaitable score_async returns.
            return await scorer.score_async(None, '{"valid": true}', {})

        # asyncio.run creates and closes its own event loop, so the test does
        # not depend on the deprecated implicit loop that
        # asyncio.get_event_loop() used to create.
        result = asyncio.run(_await_score())
        assert result == 1.0
|