In [27]:
import os
from openai import OpenAI
import dotenv
import pandas as pd

dotenv.load_dotenv()
Out[27]:
True

Suppose we want to test out a new LLM model that we've built, or maybe we have fine tuned an existing model, and we want to see how well it performs on a question-answering task. Instead of manually evaluating the answers, we can use another LLM-as-a-judge to assess the quality of the answers. This notebook demonstrates how to set up such a system using OpenAI's GPT models.

In [28]:
import os, json, math
from typing import List, Dict, Tuple
import numpy as np
from openai import OpenAI

JUDGE_MODEL = "gpt-5-mini"
BASE_MODEL = "gpt-3.5-turbo"
EMBED_MODEL = "text-embedding-3-large"
SEPARATOR = "<<<<>>>>>"
TOP_K = 4

client = OpenAI()


# embedding retriever
def _normalize_rows(x: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return x / norms


def embed_texts(texts: List[str]) -> np.ndarray:
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    return np.array([d.embedding for d in resp.data], dtype=np.float32)


class SimpleRetriever:
    def __init__(self, texts: List[str], embeddings: np.ndarray):
        assert len(texts) == embeddings.shape[0]
        self.texts = texts
        self.mat = _normalize_rows(embeddings.astype(np.float32))

    @classmethod
    def from_texts(cls, texts: List[str]):
        embs = embed_texts(texts)
        return cls(texts, embs)

    def query(
        self, q: str, top_k: int = TOP_K
    ) -> List[Tuple[str, float, int]]:
        q_emb = embed_texts([q])[0]
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)
        sims = self.mat @ q_emb
        idx = np.argsort(-sims)[:top_k]
        return [(self.texts[i], float(sims[i]), int(i)) for i in idx]


def generate_qa_examples(
    judge_model: str, docs: List[str], n_per_doc: int = 2
) -> List[Dict]:
    """Ask the model to propose Q/A pairs grounded in each doc. Returns list of {doc_id, question, answer}."""
    out = []
    for i, d in enumerate(docs):
        prompt = f"""
You are given a document. Generate {n_per_doc} diverse, factual Q&A pairs grounded ONLY in the document.
Return strict JSON with a list under "items", where each item has "question" and "answer".
No markdown. No extra keys.

Document:
{d}
"""
        resp = client.chat.completions.create(
            model=judge_model,
            messages=[
                {
                    "role": "system",
                    "content": "Return only the JSON object requested.",
                },
                {"role": "user", "content": prompt.strip()},
            ],
        )
        txt = resp.choices[0].message.content.strip()
        try:
            data = json.loads(txt)
            for it in data.get("items", []):
                out.append(
                    {
                        "doc_id": i,
                        "question": it["question"],
                        "answer": it["answer"],
                    }
                )
        except json.JSONDecodeError:
            # Fallback: skip malformed output
            continue
    return out


def answer_with_context(
    model: str,
    question: str,
    contexts: List[Tuple[str, float, int]],
    separator: str = SEPARATOR,
) -> Dict:
    # contexts: list of (text, score, doc_idx)
    context_block = f"{separator}".join([c[0] for c in contexts])
    sys = "You answer strictly from the provided context. If missing, say you don't know. Cite sources as doc[i] indices."
    usr = f"""Context documents (ordered by similarity):
{separator.join([f"[doc[{i}]] {t}" for (t, _, i) in contexts])}

Question: {question}

Instructions:
- Answer ONLY with information supported by the context.
- Include a "Sources:" line with doc indices you actually used, e.g. Sources: doc[0], doc[3].
"""

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": usr},
        ],
    )
    return {
        "answer": resp.choices[0].message.content.strip(),
        "contexts_used": [i for (_, _, i) in contexts],
    }


# LLM-as-judge
def judge_answer(
    model: str,
    question: str,
    gold_answer: str,
    predicted: str,
    contexts: List[str],
) -> Dict:
    """Ask a judge model to rate correctness and groundedness on [0,1]."""
    judge_prompt = f"""
Rate the following on a 0..1 scale.

Task:
- Correctness: Is the predicted answer semantically correct compared to the gold?
- Groundedness: Does the predicted answer stay faithful to the provided context without hallucination?

Return JSON: {{"correctness": <float>, "groundedness": <float>, "notes": "<short reason>"}}

Gold answer:
{gold_answer}

Predicted answer:
{predicted}

Context (for groundedness check):
{SEPARATOR.join(contexts)}
"""
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Return only the requested JSON object. Be strict and concise.",
            },
            {"role": "user", "content": judge_prompt.strip()},
        ],
    )
    txt = resp.choices[0].message.content.strip()
    try:
        data = json.loads(txt)
    except json.JSONDecodeError:
        print("DEBUG JUDGE OUTPUT:", txt)
        data = {
            "correctness": None,
            "groundedness": None,
            "notes": "Judge output not JSON",
        }
    return data


def evaluate_qa_examples(
    retriever: SimpleRetriever,
    qa_examples: List[Dict],
    base_model: str,
    judge_model: str,
    top_k: int = TOP_K,
    separator: str = SEPARATOR,
) -> List[Dict]:
    """Evaluate QA examples using retrieval and LLM-as-judge."""

    results = []
    for ex in qa_examples:
        q = ex["question"]
        gold = ex["answer"]
        hits = retriever.query(q, top_k=TOP_K)
        if "incorrect_answer" in ex:
            pred_text = ex[
                "incorrect_answer"
            ]  # use an incorrect answer for testing
        else:
            pred = answer_with_context(BASE_MODEL, q, hits, SEPARATOR)
            pred_text = pred["answer"]

        # build context strings for judge
        ctx_texts = [h[0] for h in hits]
        score = judge_answer(JUDGE_MODEL, q, gold, pred_text, ctx_texts)

        results.append(
            {
                "question": q,
                "gold": gold,
                "pred": pred_text,
                "contexts": [
                    dict(text=h[0], score=h[1], doc_idx=h[2]) for h in hits
                ],
                "judge": score,
            }
        )
    return results


def report_on_evaluations(evaluations: List[Dict]):
    """Generate a simple report on evaluation results."""
    # simple report
    valid = [
        r
        for r in evaluations
        if isinstance(r["judge"].get("correctness"), (int, float))
        and isinstance(r["judge"].get("groundedness"), (int, float))
    ]
    if valid:
        avg_correct = sum(r["judge"]["correctness"] for r in valid) / len(
            valid
        )
        avg_ground = sum(r["judge"]["groundedness"] for r in valid) / len(
            valid
        )
        print(
            f"Avg correctness: {avg_correct:.3f} | Avg groundedness: {avg_ground:.3f} | n={len(valid)}"
        )
    else:
        print("No valid judge scores.")
In [29]:
data = [
    "Pearl Harbor is a natural harbor on the island of Oahu, Hawaii, and the site of the attack on Pearl Harbor by the Imperial Japanese Navy on December 7, 1941",
    "Hugh Grant is a British actor known for his starring roles in romantic comedy films. Born in London, England in 1960, Grant gained widespread recognition for his performances in such popular movies as Four Weddings and a Funeral (1994), Notting Hill (1999), and Bridget Jones's Diary (2001).",
    "Inception is a 2010 science fiction film directed by Christopher Nolan and starring Leonardo DiCaprio, Ellen Page, Joseph Gordon-Levitt, and Michael Caine.",
    """Bob Ross was an American painter, art instructor, and television host. He was the creator and host of the PBS television program "The Joy of Painting" from 1983 to 1994, which became a cultural phenomenon and made him a household name. """,
    """Neural networks are a type of machine learning algorithm inspired by the structure and function of the human brain. They are composed of interconnected nodes, or "neurons," that can process and transmit information. Neural networks are used in a wide variety of applications, including image recognition, natural language processing, speech recognition, and predictive analytics.""",
]

questions = [
    "Who conducted the attack on Pearl Harbor?",
    "Who is Hugh Grant and what are some of his notable films?",
    "When was Inception released and who directed it?",
    "Bob Ross was known for what television program?",
    "Neural networks are inspired by what biological system?",
]

answers = [
    "The Imperial Japanese Navy conducted the attack on Pearl Harbor.",
    "Hugh Grant is a British actor known for his roles in romantic comedy films such as Four Weddings and a Funeral, Notting Hill, and Bridget Jones's Diary.",
    "Inception was released in 2010 and was directed by Christopher Nolan.",
    "Bob Ross was known for the television program 'The Joy of Painting'.",
    "Neural networks are inspired by the structure and function of the human brain.",
]

incorrect_answers = [
    "The attack on Pearl Harbor was conducted by the Irish Navy.",
    "Hugh Grant is an Australian actor known for his roles in action films.",
    "Inception was released in 1967 and was directed by Steven Spielberg.",
    "Bob Ross was known for the television program 'The Joy of Cooking'.",
    "Neural networks are inspired by the structure and function of the human toe.",
]

qa_examples = [
    {"doc_id": i, "question": q, "answer": a}
    for i, (q, a) in enumerate(zip(questions, answers))
]


# if we want, we can automate the QA generation bit, but for now, let's use the fixed examples
# docs_for_gen = data[:use_first_n_docs]
# qa_examples = generate_qa_examples(JUDGE_MODEL, docs_for_gen, n_q_per_doc)
# print(f"Generated {len(qa_examples)} QA pairs")

retriever = SimpleRetriever.from_texts(data)

evaluations = evaluate_qa_examples(
    retriever=retriever,
    qa_examples=qa_examples,
    base_model=BASE_MODEL,
    judge_model=JUDGE_MODEL,
    top_k=TOP_K,
    separator=SEPARATOR,
)

report_on_evaluations(evaluations)
Avg correctness: 1.000 | Avg groundedness: 1.000 | n=5
In [30]:
df = pd.DataFrame(evaluations)
df["contexts"] = df["contexts"].apply(lambda x: [c["text"] for c in x])
df["correctness"] = df["judge"].apply(lambda x: x.get("correctness"))
df["groundedness"] = df["judge"].apply(lambda x: x.get("groundedness"))
df["judge_notes"] = df["judge"].apply(lambda x: x.get("notes"))
pd.options.display.max_colwidth = 150
pd.set_option("display.max_rows", None)
df
Out[30]:
question gold pred contexts judge correctness groundedness judge_notes
0 Who conducted the attack on Pearl Harbor? The Imperial Japanese Navy conducted the attack on Pearl Harbor. Sources: doc[0]\nImperial Japanese Navy conducted the attack on Pearl Harbor. [Pearl Harbor is a natural harbor on the island of Oahu, Hawaii, and the site of the attack on Pearl Harbor by the Imperial Japanese Navy on Decem... {'correctness': 1.0, 'groundedness': 1.0, 'notes': 'Exact match to the gold; context explicitly states the Imperial Japanese Navy attacked Pearl H... 1.0 1.0 Exact match to the gold; context explicitly states the Imperial Japanese Navy attacked Pearl Harbor.
1 Who is Hugh Grant and what are some of his notable films? Hugh Grant is a British actor known for his roles in romantic comedy films such as Four Weddings and a Funeral, Notting Hill, and Bridget Jones's ... Hugh Grant is a British actor known for his starring roles in romantic comedy films such as Four Weddings and a Funeral (1994), Notting Hill (1999... [Hugh Grant is a British actor known for his starring roles in romantic comedy films. Born in London, England in 1960, Grant gained widespread rec... {'correctness': 1.0, 'groundedness': 1.0, 'notes': 'Matches the gold; film years and claims are supported by the context; no hallucination.'} 1.0 1.0 Matches the gold; film years and claims are supported by the context; no hallucination.
2 When was Inception released and who directed it? Inception was released in 2010 and was directed by Christopher Nolan. Inception was released in 2010 and directed by Christopher Nolan. \n\nSources: doc[2] [Inception is a 2010 science fiction film directed by Christopher Nolan and starring Leonardo DiCaprio, Ellen Page, Joseph Gordon-Levitt, and Mich... {'correctness': 1.0, 'groundedness': 1.0, 'notes': 'Predicted answer matches the gold and is directly supported by the provided context.'} 1.0 1.0 Predicted answer matches the gold and is directly supported by the provided context.
3 Bob Ross was known for what television program? Bob Ross was known for the television program 'The Joy of Painting'. Bob Ross was known for the television program "The Joy of Painting." \n\nSources: doc[3] [Bob Ross was an American painter, art instructor, and television host. He was the creator and host of the PBS television program "The Joy of Pain... {'correctness': 1.0, 'groundedness': 1.0, 'notes': 'Predicted answer exactly matches the gold and is directly supported by the provided context.'} 1.0 1.0 Predicted answer exactly matches the gold and is directly supported by the provided context.
4 Neural networks are inspired by what biological system? Neural networks are inspired by the structure and function of the human brain. Neural networks are inspired by the structure and function of the human brain. \n\nSources: doc[4] [Neural networks are a type of machine learning algorithm inspired by the structure and function of the human brain. They are composed of intercon... {'correctness': 1.0, 'groundedness': 1.0, 'notes': 'Predicted matches the gold exactly and is directly supported by the provided context.'} 1.0 1.0 Predicted matches the gold exactly and is directly supported by the provided context.
In [31]:
qa_examples = [
    {"doc_id": i, "question": q, "answer": a, "incorrect_answer": ia}
    for i, (q, a, ia) in enumerate(zip(questions, answers, incorrect_answers))
]

evaluations = evaluate_qa_examples(
    retriever=SimpleRetriever.from_texts(data),
    qa_examples=qa_examples,
    base_model=BASE_MODEL,
    judge_model=JUDGE_MODEL,
    top_k=TOP_K,
    separator=SEPARATOR,
)

report_on_evaluations(evaluations)
Avg correctness: 0.000 | Avg groundedness: 0.000 | n=5
In [32]:
df = pd.DataFrame(evaluations)
df["contexts"] = df["contexts"].apply(lambda x: [c["text"] for c in x])
df["correctness"] = df["judge"].apply(lambda x: x.get("correctness"))
df["groundedness"] = df["judge"].apply(lambda x: x.get("groundedness"))
df["judge_notes"] = df["judge"].apply(lambda x: x.get("notes"))
pd.options.display.max_colwidth = 150
pd.set_option("display.max_rows", None)
df
Out[32]:
question gold pred contexts judge correctness groundedness judge_notes
0 Who conducted the attack on Pearl Harbor? The Imperial Japanese Navy conducted the attack on Pearl Harbor. The attack on Pearl Harbor was conducted by the Irish Navy. [Pearl Harbor is a natural harbor on the island of Oahu, Hawaii, and the site of the attack on Pearl Harbor by the Imperial Japanese Navy on Decem... {'correctness': 0.0, 'groundedness': 0.0, 'notes': 'Completely incorrect: context explicitly states the Imperial Japanese Navy conducted the attac... 0.0 0.0 Completely incorrect: context explicitly states the Imperial Japanese Navy conducted the attack; 'Irish Navy' is a hallucination.
1 Who is Hugh Grant and what are some of his notable films? Hugh Grant is a British actor known for his roles in romantic comedy films such as Four Weddings and a Funeral, Notting Hill, and Bridget Jones's ... Hugh Grant is an Australian actor known for his roles in action films. [Hugh Grant is a British actor known for his starring roles in romantic comedy films. Born in London, England in 1960, Grant gained widespread rec... {'correctness': 0.0, 'groundedness': 0.0, 'notes': 'Contradicts gold/context: Hugh Grant is British and known for romantic comedies, not an Austra... 0.0 0.0 Contradicts gold/context: Hugh Grant is British and known for romantic comedies, not an Australian action-film actor.
2 When was Inception released and who directed it? Inception was released in 2010 and was directed by Christopher Nolan. Inception was released in 1967 and was directed by Steven Spielberg. [Inception is a 2010 science fiction film directed by Christopher Nolan and starring Leonardo DiCaprio, Ellen Page, Joseph Gordon-Levitt, and Mich... {'correctness': 0.0, 'groundedness': 0.0, 'notes': 'Both release year and director are incorrect; contradicts context stating 2010 and Christopher... 0.0 0.0 Both release year and director are incorrect; contradicts context stating 2010 and Christopher Nolan.
3 Bob Ross was known for what television program? Bob Ross was known for the television program 'The Joy of Painting'. Bob Ross was known for the television program 'The Joy of Cooking'. [Bob Ross was an American painter, art instructor, and television host. He was the creator and host of the PBS television program "The Joy of Pain... {'correctness': 0.0, 'groundedness': 0.0, 'notes': 'Incorrect show name: prediction says 'The Joy of Cooking' but context states Ross hosted 'The ... 0.0 0.0 Incorrect show name: prediction says 'The Joy of Cooking' but context states Ross hosted 'The Joy of Painting'.
4 Neural networks are inspired by what biological system? Neural networks are inspired by the structure and function of the human brain. Neural networks are inspired by the structure and function of the human toe. [Neural networks are a type of machine learning algorithm inspired by the structure and function of the human brain. They are composed of intercon... {'correctness': 0.0, 'groundedness': 0.0, 'notes': 'Replaces 'brain' with 'toe'; factually incorrect and contradicts the provided context.'} 0.0 0.0 Replaces 'brain' with 'toe'; factually incorrect and contradicts the provided context.