Building Custom LLM Judges

Evaluate GenAI application outputs using built-in guideline scorers, custom programmatic scorers, and custom LLM-based judges.

Prerequisites
pip install mlflow openai
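
If you don't already have a tracking server running, you can start one locally first (a minimal sketch; adjust the host and port if your setup differs, and point the tracking URI below at the same address):

mlflow server --host 127.0.0.1 --port 5000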
import mlflow
import openai

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("custom-judges")

mlflow.openai.autolog()

client = openai.OpenAI()


@mlflow.trace
def answer_question(question: str) -> str:
    response = client.chat.completions.create(
        model="gpt-5.4-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a medical information assistant. "
                    "Always include a disclaimer that users "
                    "should consult a healthcare professional. "
                    "Cite sources when available. "
                    "Structure your response with sections: "
                    "Overview, Details, and Disclaimer."
                ),
            },
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content

Verify the agent works:

print(answer_question("What are the symptoms of iron deficiency?"))

Next, build an evaluation dataset. Each record pairs a question with the sections the response is expected to contain:

eval_data = [
    {
        "inputs": {
            "question": "What are common symptoms of vitamin D deficiency?"
        },
        "expectations": {
            "required_sections": ["Overview", "Details", "Disclaimer"]
        },
    },
    {
        "inputs": {"question": "How does melatonin affect sleep?"},
        "expectations": {
            "required_sections": ["Overview", "Details", "Disclaimer"]
        },
    },
    {
        "inputs": {"question": "What are the risks of prolonged sitting?"},
        "expectations": {
            "required_sections": ["Overview", "Details", "Disclaimer"]
        },
    },
    {
        "inputs": {"question": "What causes migraines?"},
        "expectations": {
            "required_sections": ["Overview", "Details", "Disclaimer"]
        },
    },
]
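
Every record shares the same expectations, so you could also build the list programmatically (an equivalent sketch using a plain list comprehension):

questions = [
    "What are common symptoms of vitamin D deficiency?",
    "How does melatonin affect sleep?",
    "What are the risks of prolonged sitting?",
    "What causes migraines?",
]
eval_data = [
    {
        "inputs": {"question": q},
        "expectations": {
            "required_sections": ["Overview", "Details", "Disclaimer"]
        },
    }
    for q in questions
]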

mlflow.genai.evaluate passes each record's inputs dict to your predict function as keyword arguments, so the parameter name must match the key in the inputs dicts:

def predict_fn(question):
    return answer_question(question)

The Guidelines scorer checks whether responses follow specific rules. Each Guidelines instance evaluates its own set of guidelines independently.

from mlflow.genai.scorers import Guidelines

conciseness = Guidelines(
    name="conciseness",
    guidelines=[
        "Response must be under 500 words",
        "Response must avoid unnecessary filler phrases",
    ],
)

source_citation = Guidelines(
    name="source_citation",
    guidelines=[
        "Response must cite sources when making factual claims",
        "Response must not present opinions as facts",
    ],
)

Run evaluation with just these two guideline scorers:

guideline_results = mlflow.genai.evaluate(
    data=eval_data,
    predict_fn=predict_fn,
    scorers=[conciseness, source_citation],
)

print(guideline_results.metrics)
# Example output:
# {'conciseness/mean': 1.0, 'source_citation/mean': 0.8}

df = guideline_results.result_df
print(
    df[
        [
            "inputs/question",
            "conciseness/value",
            "source_citation/value",
            "source_citation/rationale",
        ]
    ]
)

Use the @scorer decorator to write a scorer that checks domain-specific logic in plain code, with no LLM calls needed.

This scorer verifies that responses contain the required sections defined in expectations.

from mlflow.genai.scorers import scorer
from mlflow.entities import Feedback


@scorer
def has_required_sections(outputs, expectations) -> Feedback:
    required = expectations.get("required_sections", [])
    missing = [
        section
        for section in required
        if section.lower() not in outputs.lower()
    ]

    if missing:
        return Feedback(
            value=False,
            rationale=f"Missing sections: {', '.join(missing)}",
        )
    return Feedback(
        value=True,
        rationale="All required sections present",
    )
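
Before running a full evaluation, you can sanity-check the scorer by calling it with sample arguments (a minimal sketch, assuming the decorated scorer remains directly callable with the same keyword parameters):

# Hypothetical response text covering all three sections
sample_output = "Overview: ... Details: ... Disclaimer: consult a professional."
feedback = has_required_sections(
    outputs=sample_output,
    expectations={"required_sections": ["Overview", "Details", "Disclaimer"]},
)
print(feedback.value, feedback.rationale)  # True, "All required sections present"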

For evaluation criteria that require nuanced judgment, write a scorer that calls an LLM directly. This scorer asks GPT-5.4-mini to rate the medical accuracy and tone of responses.

import json


@scorer
def medical_tone_judge(outputs) -> Feedback:
    prompt = f"""Rate the following medical information response.
Check these criteria:
1. Uses appropriate medical terminology
2. Avoids definitive diagnostic language
3. Includes a disclaimer about consulting professionals
4. Maintains a neutral, informative tone

Response to evaluate:
{outputs}

Return JSON with this exact format:
{{
    "pass": true,
    "rationale": "Brief explanation of your rating"
}}
"""
    response = openai.OpenAI().chat.completions.create(
        model="gpt-5.4-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )

    result = json.loads(response.choices[0].message.content)
    return Feedback(
        value=result["pass"],
        rationale=result["rationale"],
        source=AssessmentSource(
            source_type="LLM_JUDGE",
            source_id="openai:/gpt-5.4-mini",
        ),
    )

Import AssessmentSource at the top of your script:

from mlflow.entities import AssessmentSource, Feedback

Compare built-in guideline scorers against custom scorers in a single evaluation run.

all_results = mlflow.genai.evaluate(
    data=eval_data,
    predict_fn=predict_fn,
    scorers=[
        conciseness,
        source_citation,
        has_required_sections,
        medical_tone_judge,
    ],
)
# Aggregate metrics across all scorers
print(all_results.metrics)
# Example output:
# {
# 'conciseness/mean': 1.0,
# 'source_citation/mean': 0.8,
# 'has_required_sections/mean': 0.75,
# 'medical_tone_judge/mean': 1.0
# }

# Per-row breakdown
df = all_results.result_df
score_columns = [
    c for c in df.columns if c.endswith("/value")
]
print(df[["inputs/question"] + score_columns])

Inspect rationales for rows that failed a scorer:

for _, row in df.iterrows():
    question = row["inputs/question"]
    for col in df.columns:
        if col.endswith("/value") and row[col] is False:
            scorer_name = col.replace("/value", "")
            rationale_col = f"{scorer_name}/rationale"
            print(f"Question: {question}")
            print(f"  Failed: {scorer_name}")
            print(f"  Reason: {row[rationale_col]}")
            print()

Open the MLflow UI at http://127.0.0.1:5000 to see the evaluation run, per-row scores, and linked traces for each question.

Next Steps