Adding New Evaluators

This guide explains how to add new evaluation methods to PatientHub.

Overview

Evaluators assess simulation quality, therapist performance, or session outcomes. PatientHub supports multiple evaluation types: rating, survey, comparison, inspection, and interview.

Architecture

patienthub/evaluators/
├── __init__.py # Evaluator registry
├── rating.py # Dimension-based rating
├── survey.py # Standardized questionnaires
├── comparison.py # A/B comparison
├── inspect.py # Qualitative analysis
├── interview.py # Interactive evaluation
└── your_evaluator.py # Your new evaluator

Step 1: Create Evaluator File

Create a new file in patienthub/evaluators/:

# patienthub/evaluators/myEvaluator.py

import re
from typing import Any, Dict, List, Optional

from langchain_core.messages import HumanMessage, SystemMessage

from patienthub.base.agents import BaseAgent


class MyEvaluator(BaseAgent):
    """
    Your custom evaluation method.

    Description of what this evaluator measures.
    """

    def __init__(
        self,
        configs: Any,
        lang: str = "en",
        **kwargs
    ):
        super().__init__(configs=configs, lang=lang, **kwargs)

        # Load evaluation criteria
        self._load_criteria()

        # Initialize results storage
        self.results = []

    def _load_criteria(self):
        """Load evaluation criteria from config or file."""
        self.criteria = getattr(self.configs, 'criteria', [])
        self.scoring_scale = getattr(self.configs, 'scale', (1, 5))

    def evaluate(
        self,
        conversation: List[Dict[str, str]],
        client_profile: Optional[Dict] = None,
        therapist_profile: Optional[Dict] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Evaluate a conversation.

        Args:
            conversation: List of turns [{"role": "therapist/client", "content": "..."}]
            client_profile: Optional client character info
            therapist_profile: Optional therapist info
            **kwargs: Additional evaluation parameters

        Returns:
            Dictionary containing evaluation results
        """
        results = {
            "evaluator": "myEvaluator",
            "scores": {},
            "feedback": {},
            "overall": None,
        }

        # Prepare conversation text
        conv_text = self._format_conversation(conversation)

        # Evaluate each criterion
        for criterion in self.criteria:
            score, feedback = self._evaluate_criterion(
                conv_text, criterion, client_profile, therapist_profile
            )
            results["scores"][criterion] = score
            results["feedback"][criterion] = feedback

        # Calculate overall score, skipping criteria whose score could not be parsed
        valid_scores = [s for s in results["scores"].values() if s is not None]
        if valid_scores:
            results["overall"] = sum(valid_scores) / len(valid_scores)

        self.results.append(results)
        return results

    def _format_conversation(self, conversation: List[Dict]) -> str:
        """Format conversation for evaluation."""
        lines = []
        for turn in conversation:
            role = turn.get("role", "unknown").capitalize()
            content = turn.get("content", "")
            lines.append(f"{role}: {content}")
        return "\n\n".join(lines)

    def _evaluate_criterion(
        self,
        conversation: str,
        criterion: str,
        client_profile: Optional[Dict],
        therapist_profile: Optional[Dict],
    ) -> tuple:
        """
        Evaluate a single criterion using the LLM.

        Returns:
            (score, feedback) tuple
        """
        prompt = self._build_evaluation_prompt(
            conversation, criterion, client_profile, therapist_profile
        )

        messages = [
            SystemMessage(content=self._get_system_prompt()),
            HumanMessage(content=prompt),
        ]

        response = self.llm.invoke(messages)

        # Parse response for score and feedback
        score, feedback = self._parse_response(response.content, criterion)

        return score, feedback

    def _get_system_prompt(self) -> str:
        """System prompt for the evaluator LLM."""
        return """You are an expert evaluator assessing therapy conversations.
Provide objective, constructive assessments based on the criteria given.
Always explain your reasoning and provide specific examples from the conversation."""

    def _build_evaluation_prompt(
        self,
        conversation: str,
        criterion: str,
        client_profile: Optional[Dict],
        therapist_profile: Optional[Dict],
    ) -> str:
        """Build the evaluation prompt."""
        prompt = f"""Please evaluate the following conversation on the criterion: {criterion}

Conversation:
{conversation}

"""
        if client_profile:
            prompt += f"""
Client Profile:
{client_profile}

"""

        # Include the therapist profile when provided, mirroring the client profile block
        if therapist_profile:
            prompt += f"""
Therapist Profile:
{therapist_profile}

"""

        prompt += f"""
Rate on a scale of {self.scoring_scale[0]} to {self.scoring_scale[1]}.
Provide your rating and a brief explanation.

Format your response as:
SCORE: [number]
FEEDBACK: [your explanation]
"""
        return prompt

    def _parse_response(self, response: str, criterion: str) -> tuple:
        """Parse LLM response into score and feedback."""
        # Extract score
        score_match = re.search(r'SCORE:\s*(\d+(?:\.\d+)?)', response)
        score = float(score_match.group(1)) if score_match else None

        # Extract feedback
        feedback_match = re.search(r'FEEDBACK:\s*(.+)', response, re.DOTALL)
        feedback = feedback_match.group(1).strip() if feedback_match else response

        # Clamp score to valid range
        if score is not None:
            score = max(self.scoring_scale[0], min(self.scoring_scale[1], score))

        return score, feedback

    def get_summary(self) -> Dict[str, Any]:
        """Get summary statistics across all evaluations."""
        if not self.results:
            return {"error": "No evaluations performed"}

        summary = {
            "num_evaluations": len(self.results),
            "average_scores": {},
            "overall_average": None,
        }

        # Aggregate scores by criterion
        all_criteria = set()
        for r in self.results:
            all_criteria.update(r["scores"].keys())

        for criterion in all_criteria:
            # Check `is not None` so legitimate minimum scores are not dropped
            scores = [
                r["scores"][criterion]
                for r in self.results
                if r["scores"].get(criterion) is not None
            ]
            if scores:
                summary["average_scores"][criterion] = sum(scores) / len(scores)

        # Overall average
        overalls = [r["overall"] for r in self.results if r["overall"] is not None]
        if overalls:
            summary["overall_average"] = sum(overalls) / len(overalls)

        return summary

    def reset(self):
        """Reset evaluator for new evaluation batch."""
        self.results = []

Step 2: Register the Evaluator

Add to patienthub/evaluators/__init__.py:

from patienthub.evaluators.myEvaluator import MyEvaluator

EVALUATOR_REGISTRY = {
    # ... existing evaluators ...
    'myEvaluator': MyEvaluator,
}

def get_evaluator(configs, lang: str = "en", **kwargs):
    eval_type = configs.eval_type
    if eval_type not in EVALUATOR_REGISTRY:
        raise ValueError(f"Unknown evaluator type: {eval_type}")
    return EVALUATOR_REGISTRY[eval_type](configs=configs, lang=lang, **kwargs)

Step 3: Define Evaluation Dimensions

Create dimension definitions in patienthub/evaluators/dimensions/:

# patienthub/evaluators/dimensions/myDimensions.py

MY_DIMENSIONS = {
    "empathy": {
        "name": "Empathy",
        "description": "The degree to which the therapist demonstrates understanding of the client's emotional experience",
        "indicators": [
            "Reflects client's feelings accurately",
            "Validates emotional experiences",
            "Shows genuine concern",
        ],
        "scale": (1, 5),
        "anchors": {
            1: "No empathy demonstrated",
            3: "Moderate empathy with some reflection",
            5: "Deep empathic understanding throughout",
        },
    },
    "authenticity": {
        "name": "Client Authenticity",
        "description": "How realistic and consistent the simulated client's responses are",
        "indicators": [
            "Responses match character profile",
            "Emotional reactions are appropriate",
            "Maintains consistent personality",
        ],
        "scale": (1, 5),
        "anchors": {
            1: "Responses feel artificial or inconsistent",
            3: "Generally authentic with minor inconsistencies",
            5: "Highly authentic and completely consistent",
        },
    },
}
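
How these definitions reach the evaluator is up to you. One option, sketched below on the assumption that you keep the MyEvaluator class from Step 1, is to resolve the configured criteria against MY_DIMENSIONS in _load_criteria so that prompt-building can quote descriptions, indicators, and anchors; self.dimensions is a new attribute introduced here purely for illustration.

# patienthub/evaluators/myEvaluator.py (sketch: only _load_criteria changes)
from patienthub.base.agents import BaseAgent
from patienthub.evaluators.dimensions.myDimensions import MY_DIMENSIONS


class MyEvaluator(BaseAgent):
    # ... __init__, evaluate, etc. as in Step 1 ...

    def _load_criteria(self):
        """Resolve configured criteria against the dimension definitions."""
        requested = getattr(self.configs, 'criteria', list(MY_DIMENSIONS))
        # Keep the full definition so prompts can cite description, indicators, and anchors.
        self.dimensions = {name: MY_DIMENSIONS[name] for name in requested}
        self.criteria = list(self.dimensions)
        self.scoring_scale = getattr(self.configs, 'scale', (1, 5))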

Step 4: Create Configuration

Add configuration options:

# configs/evaluator/myEvaluator.yaml
eval_type: myEvaluator
model_type: OPENAI
model_name: gpt-4o
temperature: 0.0 # Deterministic for evaluation
max_tokens: 1024
criteria:
  - empathy
  - authenticity
scale: [1, 5]
target: therapist # or "client" or "both"
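
To check that the config file, registry entry, and evaluator class line up, you can load the YAML with OmegaConf and go through the factory. This is a quick sanity sketch, assuming the path above and valid model credentials:

from omegaconf import OmegaConf

from patienthub.evaluators import get_evaluator

cfg = OmegaConf.load("configs/evaluator/myEvaluator.yaml")
evaluator = get_evaluator(configs=cfg)   # resolves 'myEvaluator' via EVALUATOR_REGISTRY
print(type(evaluator).__name__)          # -> MyEvaluator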

Step 5: Add Tests

# patienthub/tests/test_myEvaluator.py

import pytest
from omegaconf import OmegaConf
from patienthub.evaluators import get_evaluator


@pytest.fixture
def evaluator_config():
    return OmegaConf.create({
        'eval_type': 'myEvaluator',
        'model_type': 'OPENAI',
        'model_name': 'gpt-4o-mini',
        'temperature': 0.0,
        'max_tokens': 512,
        'criteria': ['empathy', 'authenticity'],
        'scale': [1, 5],
    })


@pytest.fixture
def sample_conversation():
    return [
        {"role": "therapist", "content": "Hello, how are you feeling today?"},
        {"role": "client", "content": "Not great, I've been really anxious lately."},
        {"role": "therapist", "content": "I hear that you've been struggling with anxiety. That sounds difficult. Can you tell me more about what triggers it?"},
        {"role": "client", "content": "Work mostly. The deadlines are overwhelming."},
    ]


def test_evaluator_initialization(evaluator_config):
    evaluator = get_evaluator(configs=evaluator_config)
    assert evaluator is not None


def test_single_evaluation(evaluator_config, sample_conversation):
    evaluator = get_evaluator(configs=evaluator_config)
    results = evaluator.evaluate(sample_conversation)

    assert "scores" in results
    assert "feedback" in results
    assert "overall" in results


def test_multiple_evaluations(evaluator_config, sample_conversation):
    evaluator = get_evaluator(configs=evaluator_config)

    evaluator.evaluate(sample_conversation)
    evaluator.evaluate(sample_conversation)

    summary = evaluator.get_summary()
    assert summary["num_evaluations"] == 2
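
Run the new tests with pytest patienthub/tests/test_myEvaluator.py. Note that the evaluation tests call the configured LLM, so they assume valid API credentials; mock or mark them if your CI cannot make network calls.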

Evaluator Types

Rating Evaluator

Scores conversations on predefined dimensions (1-5 scale).

Survey Evaluator

Administers standardized questionnaires (PHQ-9, GAD-7, etc.).

Comparison Evaluator

A/B comparison between two agents or methods.

Inspection Evaluator

Qualitative analysis with structured feedback.

Interview Evaluator

Interactive evaluation through follow-up questions.
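
All of these are selected through the same eval_type switch from Step 2. The snippet below is illustrative only: it assumes the registry keys mirror the module names (e.g. rating, survey) and that each built-in type has its own YAML under configs/evaluator/, so check EVALUATOR_REGISTRY and the existing config files for the exact names.

from omegaconf import OmegaConf

from patienthub.evaluators import get_evaluator

# Assumed file names; verify against configs/evaluator/ and EVALUATOR_REGISTRY.
rating_eval = get_evaluator(configs=OmegaConf.load("configs/evaluator/rating.yaml"))
survey_eval = get_evaluator(configs=OmegaConf.load("configs/evaluator/survey.yaml"))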

Advanced Features

Turn-Level vs Session-Level

class TurnLevelEvaluator(BaseEvaluator):
    def evaluate(self, conversation, **kwargs):
        turn_results = []
        for i in range(0, len(conversation), 2):  # Each therapist/client exchange
            turn = conversation[i:i + 2]
            result = self._evaluate_turn(turn)
            turn_results.append(result)

        return {
            "turn_results": turn_results,
            "session_aggregate": self._aggregate(turn_results),
        }
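
_evaluate_turn and _aggregate are left for you to define. A minimal _aggregate to add inside TurnLevelEvaluator, assuming each turn result is a dict of numeric scores keyed by criterion (None where parsing failed), could average per criterion:

    def _aggregate(self, turn_results):
        """Average each criterion across turns, skipping missing scores."""
        totals, counts = {}, {}
        for result in turn_results:
            for criterion, score in result.items():
                if score is None:
                    continue
                totals[criterion] = totals.get(criterion, 0.0) + score
                counts[criterion] = counts.get(criterion, 0) + 1
        return {c: totals[c] / counts[c] for c in totals}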

Multi-Rater Support

class MultiRaterEvaluator(BaseEvaluator):
    def __init__(self, configs, **kwargs):
        super().__init__(configs, **kwargs)
        self.num_raters = getattr(configs, 'num_raters', 3)

    def evaluate(self, conversation, **kwargs):
        ratings = []
        for _ in range(self.num_raters):
            rating = self._single_evaluation(conversation)
            ratings.append(rating)

        return {
            "individual_ratings": ratings,
            "consensus": self._compute_consensus(ratings),
            "agreement": self._compute_agreement(ratings),
        }
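
_single_evaluation, _compute_consensus, and _compute_agreement are likewise up to you. One simple sketch, assuming each rating is a dict with a numeric "overall" field like MyEvaluator's results, uses the mean as consensus and the standard deviation as an inverse agreement signal (add import statistics at the top of the module):

    def _compute_consensus(self, ratings):
        """Mean overall score across raters."""
        overalls = [r["overall"] for r in ratings if r.get("overall") is not None]
        return statistics.mean(overalls) if overalls else None

    def _compute_agreement(self, ratings):
        """Standard deviation of overall scores; lower means raters agree more."""
        overalls = [r["overall"] for r in ratings if r.get("overall") is not None]
        return statistics.stdev(overalls) if len(overalls) > 1 else 0.0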

Checklist

Before submitting your new evaluator:

  • Evaluator class in patienthub/evaluators/
  • Registered in __init__.py
  • Dimension definitions (if applicable)
  • Configuration file
  • Unit tests passing
  • Documentation updated
  • Example usage: python -m examples.evaluate evaluator=yourEvaluator