def main() -> int:
    """Parse the command line and dispatch to the requested subcommand.

    Returns:
        The process exit code: 1 when no subcommand was supplied (usage is
        printed first), the fitness runner's own exit code for ``fitness``,
        and 0 for anything else.
    """
    cli = argparse.ArgumentParser(
        prog="pipeline.quality",
        description="FYN-LLM quality assurance toolkit",
    )
    commands = cli.add_subparsers(dest="command")

    # -- fitness subcommand --
    commands.add_parser(
        "fitness",
        help="Run LLM fitness tests across four categories",
    )

    chosen = cli.parse_args().command

    # Guard: without a subcommand there is nothing to run, so show usage
    # and report failure to the calling shell.
    if chosen is None:
        cli.print_help()
        return 1

    if chosen != "fitness":
        return 0

    # Build the client from application settings and hand it to the runner;
    # the runner's return value is already a process exit code.
    fitness_runner = FitnessRunner(LLMClient(get_settings()))
    return fitness_runner.run_all()


if __name__ == "__main__":
    sys.exit(main())
# ── Result types ─────────────────────────────────────────────────────────────

@dataclass
class TestResult:
    """Outcome of a single fitness test."""

    name: str
    passed: bool
    elapsed_seconds: float
    token_count: int | None = None  # None when the response exposes no token usage
    detail: str = ""  # failure explanation; left empty when the test passed


@dataclass
class CategoryReport:
    """Results for one test category."""

    category: str
    results: list[TestResult] = field(default_factory=list)

    @property
    def all_passed(self) -> bool:
        """Return True when every recorded result passed (True when empty)."""
        return all(r.passed for r in self.results)


# ── Pydantic models for JSON compliance tests ────────────────────────────────

class SimpleItem(BaseModel):
    """Schema for the flat JSON compliance test."""

    name: str
    count: int


class Address(BaseModel):
    """Nested address object used by :class:`PersonWithAddress`."""

    street: str
    city: str
    zip_code: str


class PersonWithAddress(BaseModel):
    """Schema for the nested JSON compliance test."""

    name: str
    age: int
    address: Address


# ── Runner ───────────────────────────────────────────────────────────────────

class FitnessRunner:
    """Runs all fitness tests against the configured LLM endpoint.

    Each ``_test_*`` method issues one completion request and converts the
    outcome, including any exception raised along the way, into a
    :class:`TestResult`, so a single failing check never aborts the suite.
    """

    def __init__(self, client: LLMClient) -> None:
        self.client = client

    # ── Public entry point ───────────────────────────────────────────────

    def run_all(self) -> int:
        """Run all fitness tests, print report, return exit code (0=pass, 1=fail)."""
        # Connectivity pre-check — fail fast with a clear message
        try:
            self._probe_connectivity()
        except (openai.APIConnectionError, openai.APITimeoutError) as exc:
            url = self.client.settings.llm_api_url
            fallback = self.client.settings.llm_fallback_url
            print(
                f"\n✗ Cannot reach LLM endpoint at {url} (fallback {fallback})\n"
                f" Error: {exc}\n"
            )
            return 1

        categories: list[CategoryReport] = [
            self._run_mandelbrot(),
            self._run_json_compliance(),
            self._run_instruction_following(),
            self._run_diverse_battery(),
        ]

        self._print_report(categories)

        return 0 if all(c.all_passed for c in categories) else 1

    # ── Connectivity probe ───────────────────────────────────────────────

    def _probe_connectivity(self) -> None:
        """Quick completion to verify the endpoint is reachable."""
        self.client.complete(
            system_prompt="You are a test probe.",
            user_prompt="Respond with the single word: ok",
        )

    # ── Shared result builders ───────────────────────────────────────────

    @staticmethod
    def _result(
        name: str,
        passed: bool,
        elapsed: float,
        resp: object,
        detail: str = "",
    ) -> TestResult:
        """Build a :class:`TestResult` for a request that completed.

        The token count is read with ``getattr`` so a response object that
        lacks ``completion_tokens`` yields ``None`` instead of turning a
        finished check into an ``AttributeError``.
        """
        return TestResult(
            name=name,
            passed=passed,
            elapsed_seconds=round(elapsed, 2),
            token_count=getattr(resp, "completion_tokens", None),
            detail=detail,
        )

    @staticmethod
    def _error(name: str, started: float, exc: Exception) -> TestResult:
        """Build a failing :class:`TestResult` for a test that raised.

        ``started`` is the ``time.monotonic()`` reading taken before the
        request, so the elapsed time covers everything up to the failure.
        """
        return TestResult(
            name=name,
            passed=False,
            elapsed_seconds=round(time.monotonic() - started, 2),
            detail=f"Exception: {exc}",
        )

    # ── Category 1: Mandelbrot reasoning ─────────────────────────────────

    def _run_mandelbrot(self) -> CategoryReport:
        """Run the single Mandelbrot knowledge test."""
        cat = CategoryReport(category="Mandelbrot Reasoning")
        cat.results.append(self._test_mandelbrot())
        return cat

    def _test_mandelbrot(self) -> TestResult:
        """Check that the answer cites the area estimate and its uncertainty."""
        name = "mandelbrot_area_knowledge"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="You are a mathematics expert. Answer precisely and concisely.",
                user_prompt=(
                    "What is the approximate area of the Mandelbrot set? "
                    "Include the numerical value and mention whether the exact area is known."
                ),
                modality="thinking",
            )
            elapsed = time.monotonic() - t0
            raw = str(resp)
            text = raw.lower()
            # Check for key concepts
            has_area = any(kw in text for kw in ["1.506", "1.507", "1.50659"])
            has_uncertainty = any(
                kw in text
                for kw in ["not exactly known", "not known exactly", "approximate", "estimated", "conjecture"]
            )
            passed = has_area and has_uncertainty
            detail = (
                ""
                if passed
                else f"Missing: area={has_area}, uncertainty={has_uncertainty}. Response: {raw[:200]}"
            )
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    # ── Category 2: JSON compliance ──────────────────────────────────────

    def _run_json_compliance(self) -> CategoryReport:
        """Run the flat and nested JSON schema tests."""
        cat = CategoryReport(category="JSON Compliance")
        cat.results.append(self._test_json_simple())
        cat.results.append(self._test_json_nested())
        return cat

    def _test_json_simple(self) -> TestResult:
        """Request a flat two-key object and validate it as :class:`SimpleItem`."""
        name = "json_simple_object"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="You are a JSON generator. Output ONLY valid JSON, nothing else.",
                user_prompt=(
                    'Generate a JSON object with exactly two keys: "name" (a string) '
                    'and "count" (an integer). Example structure: {"name": "...", "count": N}'
                ),
                response_model=SimpleItem,
                modality="chat",
            )
            elapsed = time.monotonic() - t0
            return self._validate_json(name, resp, SimpleItem, elapsed)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _test_json_nested(self) -> TestResult:
        """Request a nested object and validate it as :class:`PersonWithAddress`."""
        name = "json_nested_object"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="You are a JSON generator. Output ONLY valid JSON, nothing else.",
                user_prompt=(
                    'Generate a JSON object with keys "name" (string), "age" (integer), '
                    'and "address" (object with "street", "city", "zip_code" string fields).'
                ),
                response_model=PersonWithAddress,
                modality="chat",
            )
            elapsed = time.monotonic() - t0
            return self._validate_json(name, resp, PersonWithAddress, elapsed)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _validate_json(
        self,
        name: str,
        resp: str,
        model: type[BaseModel],
        elapsed: float,
    ) -> TestResult:
        """Parse response as JSON, validate against Pydantic model.

        Args:
            name: Test identifier to record on the result.
            resp: Raw completion; converted with ``str()`` before parsing.
            model: Pydantic model the parsed payload must satisfy.
            elapsed: Request duration in seconds, measured by the caller.

        Returns:
            A passing result when the text is valid JSON matching ``model``;
            otherwise a failing result whose detail names the failed stage.
        """
        text = str(resp).strip()
        if not text:
            return self._result(name, False, elapsed, resp, "Empty response from LLM")
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as exc:
            return self._result(
                name, False, elapsed, resp, f"Invalid JSON: {exc}. Raw: {text[:200]}"
            )
        try:
            model.model_validate(parsed)
        except Exception as exc:
            return self._result(
                name, False, elapsed, resp, f"Schema validation failed: {exc}"
            )
        return self._result(name, True, elapsed, resp)

    # ── Category 3: Instruction following ────────────────────────────────

    def _run_instruction_following(self) -> CategoryReport:
        """Run the bullet-count, keyword and lowercase tests."""
        cat = CategoryReport(category="Instruction Following")
        cat.results.append(self._test_bullet_count())
        cat.results.append(self._test_keyword_inclusion())
        cat.results.append(self._test_lowercase_only())
        return cat

    def _test_bullet_count(self) -> TestResult:
        """Check that the reply contains exactly three ``- `` bullet lines."""
        name = "instruction_bullet_count"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="Follow instructions exactly.",
                user_prompt="List exactly 3 benefits of exercise. Use bullet points starting with '- '.",
            )
            elapsed = time.monotonic() - t0
            raw = str(resp)
            stripped_lines = (line.strip() for line in raw.strip().splitlines())
            bullets = [line for line in stripped_lines if line.startswith("- ")]
            passed = len(bullets) == 3
            detail = "" if passed else f"Expected 3 bullets, got {len(bullets)}: {raw[:200]}"
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _test_keyword_inclusion(self) -> TestResult:
        """Check that the reply contains the required word, case-insensitively."""
        name = "instruction_keyword_inclusion"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="Follow instructions exactly.",
                user_prompt=(
                    "Write one sentence about the weather. "
                    'You MUST include the word "elephant" somewhere in your sentence.'
                ),
            )
            elapsed = time.monotonic() - t0
            raw = str(resp)
            passed = "elephant" in raw.lower()
            detail = "" if passed else f"Missing keyword 'elephant'. Response: {raw[:200]}"
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _test_lowercase_only(self) -> TestResult:
        """Check that the reply has no uppercase letters and is not trivially short."""
        name = "instruction_lowercase_only"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="Follow instructions exactly.",
                user_prompt=(
                    "Write a short sentence about the ocean. "
                    "Use ONLY lowercase letters — no uppercase at all, not even at the start."
                ),
            )
            elapsed = time.monotonic() - t0
            text = str(resp).strip()
            # Allow non-alpha chars (punctuation, spaces, numbers) but no uppercase letters
            has_upper = any(c.isupper() for c in text)
            passed = not has_upper and len(text) > 5
            detail = "" if passed else f"Contains uppercase or too short. Response: {text[:200]}"
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    # ── Category 4: Diverse prompt battery ───────────────────────────────

    def _run_diverse_battery(self) -> CategoryReport:
        """Run the summarization, classification and extraction tests."""
        cat = CategoryReport(category="Diverse Prompt Battery")
        cat.results.append(self._test_summarization())
        cat.results.append(self._test_classification())
        cat.results.append(self._test_extraction())
        return cat

    def _test_summarization(self) -> TestResult:
        """Check that a requested two-sentence summary has a plausible length."""
        name = "battery_summarization"
        paragraph = (
            "The James Webb Space Telescope (JWST) is the largest optical telescope in space. "
            "Launched in December 2021, it is designed to conduct infrared astronomy. Its high "
            "resolution and sensitivity allow it to view objects too old and distant for the Hubble "
            "Space Telescope. Among its goals are observing the first stars and the formation of "
            "the first galaxies, and detailed atmospheric characterization of exoplanets."
        )
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="You are a concise summarizer.",
                user_prompt=f"Summarize the following in exactly 2 sentences:\n\n{paragraph}",
            )
            elapsed = time.monotonic() - t0
            text = str(resp).strip()
            # Rough sentence count: split on period followed by space or end
            sentences = [s.strip() for s in text.replace("! ", ". ").split(". ") if s.strip()]
            # Be generous: 1-3 sentences is acceptable
            passed = 1 <= len(sentences) <= 3 and len(text) > 20
            detail = (
                ""
                if passed
                else f"Expected ~2 sentences, got {len(sentences)}. Response: {text[:200]}"
            )
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _test_classification(self) -> TestResult:
        """Check that the reply is exactly one of the offered category words."""
        name = "battery_classification"
        categories = ["technology", "sports", "politics", "science", "entertainment"]
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt=(
                    "You are a text classifier. Respond with ONLY one word from the given categories."
                ),
                user_prompt=(
                    f"Classify the following text into one of these categories: {', '.join(categories)}\n\n"
                    "Text: \"NASA's Perseverance rover has discovered organic molecules on Mars, "
                    "suggesting the planet may have once harbored microbial life.\"\n\n"
                    "Category:"
                ),
            )
            elapsed = time.monotonic() - t0
            answer = str(resp).strip().lower().rstrip(".")
            # NOTE(review): any listed category passes, so this verifies the
            # output format only, not that the label is right for the text.
            # Confirm that is the intent.
            passed = answer in categories
            detail = "" if passed else f"Response '{answer}' not in {categories}"
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    def _test_extraction(self) -> TestResult:
        """Check that the reply is a JSON object holding the three requested keys."""
        name = "battery_extraction"
        t0 = time.monotonic()
        try:
            resp = self.client.complete(
                system_prompt="You are a data extractor. Output ONLY valid JSON, nothing else.",
                user_prompt=(
                    "Extract the following fields as a JSON object: "
                    '"event_name", "date", "location"\n\n'
                    "Text: \"The annual Tech Summit 2026 will be held on March 15, 2026 "
                    'in San Francisco, California."\n\n'
                    "JSON:"
                ),
                response_model=BaseModel,  # triggers json mode
                modality="chat",
            )
            elapsed = time.monotonic() - t0
            text = str(resp).strip()
            if not text:
                return self._result(name, False, elapsed, resp, "Empty response from LLM")
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError as exc:
                return self._result(
                    name, False, elapsed, resp, f"Invalid JSON: {exc}. Raw: {text[:200]}"
                )
            # json.loads also accepts top-level arrays, strings and numbers;
            # only an object can carry the requested keys, so report any
            # other shape explicitly instead of failing on a missing .keys().
            if not isinstance(parsed, dict):
                return self._result(
                    name, False, elapsed, resp,
                    f"Expected a JSON object, got {type(parsed).__name__}. Raw: {text[:200]}",
                )
            required_keys = {"event_name", "date", "location"}
            missing = required_keys - parsed.keys()
            passed = not missing
            detail = "" if passed else f"Missing keys: {missing}"
            return self._result(name, passed, elapsed, resp, detail)
        except Exception as exc:
            return self._error(name, t0, exc)

    # ── Report formatting ────────────────────────────────────────────────

    def _print_report(self, categories: list[CategoryReport]) -> None:
        """Print a formatted pass/fail report to stdout."""
        total = 0
        passed_count = 0

        print("\n" + "=" * 60)
        print(" FYN-LLM FITNESS REPORT")
        print("=" * 60)

        for cat in categories:
            status = "✓ PASS" if cat.all_passed else "✗ FAIL"
            print(f"\n [{status}] {cat.category}")
            for r in cat.results:
                total += 1
                icon = "✓" if r.passed else "✗"
                # Compare against None so a reported count of 0 is still shown.
                tokens = f" ({r.token_count} tok)" if r.token_count is not None else ""
                print(f" {icon} {r.name} [{r.elapsed_seconds}s{tokens}]")
                if r.detail:
                    # Indent detail lines
                    for line in r.detail.splitlines():
                        print(f" {line}")
                if r.passed:
                    passed_count += 1

        print("\n" + "-" * 60)
        print(f" Total: {passed_count}/{total} passed")
        if passed_count == total:
            print(" Result: ✓ ALL PASS")
        else:
            print(f" Result: ✗ {total - passed_count} FAILED")
        print("=" * 60 + "\n")