async def compute_uqlm_confidence(
    dataframe: pd.DataFrame,
    prompt_col: str = "input",
    response_col: Optional[str] = None,
    sampled_responses_col: Optional[str] = None,
    blackbox_scorers: Optional[List[str]] = None,
    ensemble: str = "mean",
    ensemble_weights: Optional[Dict[str, float]] = None,
    risk_threshold: Optional[float] = None,
    mode: str = "black_box",
    llm: Optional[Any] = None,
    num_responses: int = 5,
    whitebox_scorers: Optional[List[str]] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """Compute per-scorer and ensemble confidence with UQLM and return merged dataframe.

    Args:
        dataframe: Input frame, one row per prompt (and optionally response).
        prompt_col: Column holding the prompts.
        response_col: Column with pre-generated responses; used together with
            ``sampled_responses_col`` to score without calling the LLM.
        sampled_responses_col: Column with lists of sampled responses per row.
        blackbox_scorers: Black-box scorer names; defaults to
            ``["noncontradiction"]``.
        ensemble: ``"mean"``, ``"median"``, or ``"weighted_mean"``. A
            ``"weighted_mean"`` without ``ensemble_weights`` falls back to a
            plain mean (same for unrecognized values).
        ensemble_weights: Scorer-name -> weight mapping for ``"weighted_mean"``.
        risk_threshold: If given, adds a boolean ``uqlm_high_risk`` column.
        mode: ``"black_box"``, ``"white_box"``, or ``"auto"``; ``"auto"``
            picks white-box only when ``llm`` exposes a ``logprobs`` attribute.
        llm: Chat model used for generation (required for white-box scoring).
        num_responses: Samples per prompt when generating for black-box mode.
        whitebox_scorers: White-box scorer names; defaults to
            ``["min_probability"]``.
        verbose: Print progress messages.

    Adds columns:
        - uqlm_confidence [0,1]
        - uqlm_risk [0,1] = 1 - confidence
        - uqlm_high_risk (optional bool) if risk_threshold provided
        - uqlm_<scorer>_conf (per-scorer, if available)

    Raises:
        ImportError: If UQLM is not installed.
        ValueError: If ``mode`` is not one of the accepted values.
    """
    if not HAVE_UQLM:
        raise ImportError("UQLM is not installed. `pip install uqlm`.")
    # Resolve scorer lists per call instead of using mutable default arguments,
    # which would be shared (and mutable) across calls.
    if blackbox_scorers is None:
        blackbox_scorers = ["noncontradiction"]
    if whitebox_scorers is None:
        whitebox_scorers = ["min_probability"]
    df = dataframe.copy()
    per_scorer_cols: List[str] = []

    def _ensemble(row: Dict[str, Any]) -> float:
        """Combine per-scorer confidences for one row per the `ensemble` policy."""
        vals = [row[c] for c in per_scorer_cols if pd.notnull(row.get(c))]
        if not vals:
            return float("nan")
        if ensemble == "median":
            s = sorted(vals)
            n = len(s)
            return float(s[n // 2] if n % 2 else (s[n // 2 - 1] + s[n // 2]) / 2)
        if ensemble == "weighted_mean" and ensemble_weights:
            num = 0.0
            den = 0.0
            for c in per_scorer_cols:
                # Recover the scorer name from the column name to find its weight.
                sc = c.replace("uqlm_", "").replace("_conf", "")
                if sc in ensemble_weights and pd.notnull(row.get(c)):
                    w = float(ensemble_weights[sc])
                    num += w * float(row[c])
                    den += w
            return float(num / den) if den > 0 else float("nan")
        # "mean", plus the fallback for unknown modes or missing weights.
        return float(sum(vals) / len(vals))

    prompts = df[prompt_col].tolist()
    responses = df[response_col].tolist() if response_col is not None and response_col in df.columns else None
    sampled = df[sampled_responses_col].tolist() if sampled_responses_col is not None and sampled_responses_col in df.columns else None

    if mode == "auto":
        # White-box scoring needs token logprobs; otherwise fall back to black-box.
        mode_to_run = "white_box" if llm is not None and hasattr(llm, "logprobs") else "black_box"
    else:
        mode_to_run = mode

    if mode_to_run == "black_box":
        bbuq = BlackBoxUQ(llm=llm, scorers=blackbox_scorers)
        if responses is not None and sampled is not None:
            # Score pre-generated responses. UQLM's score() is a coroutine,
            # so it must be awaited (the original call was missing `await`).
            results = await bbuq.score(responses=responses, sampled_responses=sampled, show_progress_bars=False)
        else:
            results = await bbuq.generate_and_score(prompts=prompts, num_responses=num_responses, show_progress_bars=False)
        for sc_name in results.data:
            if sc_name in blackbox_scorers:
                per_scorer_cols.append(f"uqlm_{sc_name}_conf")
                df[f"uqlm_{sc_name}_conf"] = results.data[sc_name]
    elif mode_to_run == "white_box":
        wbuq = WhiteBoxUQ(llm=llm, scorers=whitebox_scorers)
        if verbose:
            print("WhiteBoxUQ.generate_and_score ...")
        results = await wbuq.generate_and_score(prompts=prompts, show_progress_bars=False)
        for sc_name in results.data:
            if sc_name in whitebox_scorers:
                per_scorer_cols.append(f"uqlm_{sc_name}_conf")
                df[f"uqlm_{sc_name}_conf"] = results.data[sc_name]
    else:
        raise ValueError("mode must be one of {'black_box', 'white_box', 'auto'}.")

    df["uqlm_confidence"] = df.apply(_ensemble, axis=1)
    df["uqlm_risk"] = 1.0 - df["uqlm_confidence"]
    if risk_threshold is not None:
        df["uqlm_high_risk"] = df["uqlm_risk"] >= float(risk_threshold)
    return df