Quickstart: Datasets & Experiments

Phoenix helps you run experiments over your AI and LLM applications to evaluate and iteratively improve their performance. This quickstart shows you how to get up and running quickly.

Launch Phoenix

Phoenix must be running before you upload datasets or run experiments. Follow the environment setup steps at the end of this quickstart to connect to Phoenix Cloud or to a self-hosted Phoenix instance.

Datasets

Upload a dataset.

Python

import pandas as pd
import phoenix as px

df = pd.DataFrame(
    [
        {
            "question": "What is Paul Graham known for?",
            "answer": "Co-founding Y Combinator and writing on startups and techology.",
            "metadata": {"topic": "tech"},
        }
    ]
)
phoenix_client = px.Client()
dataset = phoenix_client.upload_dataset(
    dataframe=df,
    dataset_name="test-dataset",
    input_keys=["question"],
    output_keys=["answer"],
    metadata_keys=["metadata"],
)

TypeScript

import { createClient } from "@arizeai/phoenix-client";
import { createDataset } from "@arizeai/phoenix-client/datasets";

// Create example data
const examples = [
  {
    input: { question: "What is Paul Graham known for?" },
    output: {
      answer: "Co-founding Y Combinator and writing on startups and techology."
    },
    metadata: { topic: "tech" }
  }
];

// Initialize Phoenix client
const client = createClient();

// Upload dataset
const { datasetId } = await createDataset({
  client,
  name: "test-dataset",
  examples: examples
});

Tasks

Create a task to evaluate.

Python

from openai import OpenAI
from phoenix.experiments.types import Example

openai_client = OpenAI()

task_prompt_template = "Answer in a few words: {question}"


def task(example: Example) -> str:
    question = example.input["question"]
    message_content = task_prompt_template.format(question=question)
    response = openai_client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": message_content}]
    )
    return response.choices[0].message.content

TypeScript

import { OpenAI } from "openai";
import { type RunExperimentParams } from "@arizeai/phoenix-client/experiments";

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

const taskPromptTemplate = "Answer in a few words: {question}";

const task: RunExperimentParams["task"] = async (example) => {
  // Pull the question from the example input, with a fallback if it is missing
  const question = String(example.input.question || "No question provided");
  const messageContent = taskPromptTemplate.replace("{question}", question);

  const response = await openai.chat.completions.create({
    model: "gpt-4o", 
    messages: [{ role: "user", content: messageContent }]
  });

  return response.choices[0]?.message?.content || "";
};

Evaluators

Use pre-built evaluators to grade task output with code...

Python

from phoenix.experiments.evaluators import ContainsAnyKeyword

contains_keyword = ContainsAnyKeyword(keywords=["Y Combinator", "YC"])

TypeScript

import { asEvaluator } from "@arizeai/phoenix-client/experiments";

// Code-based evaluator that checks if response contains specific keywords
const containsKeyword = asEvaluator({
  name: "contains_keyword",
  kind: "CODE",
  evaluate: async ({ output }) => {
    const keywords = ["Y Combinator", "YC"];
    const outputStr = String(output).toLowerCase();
    const contains = keywords.some((keyword) =>
      outputStr.includes(keyword.toLowerCase())
    );

    return {
      score: contains ? 1.0 : 0.0,
      label: contains ? "contains_keyword" : "missing_keyword",
      metadata: { keywords },
      explanation: contains
        ? `Output contains one of the keywords: ${keywords.join(", ")}`
        : `Output does not contain any of the keywords: ${keywords.join(", ")}`
    };
  }
});

or LLMs.

Python

from phoenix.experiments.evaluators import ConcisenessEvaluator
from phoenix.evals.models import OpenAIModel

model = OpenAIModel(model="gpt-4o")
conciseness = ConcisenessEvaluator(model=model)

TypeScript

import { asEvaluator } from "@arizeai/phoenix-client/experiments";
import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

// LLM-based evaluator for conciseness
const conciseness = asEvaluator({
  name: "conciseness",
  kind: "LLM",
  evaluate: async ({ output }) => {
    const prompt = `
      Rate the following text on a scale of 0.0 to 1.0 for conciseness (where 1.0 is perfectly concise).
      
      TEXT: ${output}
      
      Return only a number between 0.0 and 1.0.
    `;

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: [{ role: "user", content: prompt }]
    });

    const scoreText = response.choices[0]?.message?.content?.trim() || "0";
    const score = parseFloat(scoreText);

    return {
      score: isNaN(score) ? 0.5 : score,
      label: score > 0.7 ? "concise" : "verbose",
      metadata: {},
      explanation: `Conciseness score: ${score}`
    };
  }
});

Define custom evaluators with code...

Python

from typing import Any, Dict


def jaccard_similarity(output: str, expected: Dict[str, Any]) -> float:
    # https://en.wikipedia.org/wiki/Jaccard_index
    actual_words = set(output.lower().split(" "))
    expected_words = set(expected["answer"].lower().split(" "))
    words_in_common = actual_words.intersection(expected_words)
    all_words = actual_words.union(expected_words)
    return len(words_in_common) / len(all_words)

TypeScript

import { asEvaluator } from "@arizeai/phoenix-client/experiments";

// Custom Jaccard similarity evaluator
const jaccardSimilarity = asEvaluator({
  name: "jaccard_similarity",
  kind: "CODE",
  evaluate: async ({ output, expected }) => {
    const actualWords = new Set(String(output).toLowerCase().split(" "));
    const expectedAnswer = String(expected?.answer || "");
    const expectedWords = new Set(expectedAnswer.toLowerCase().split(" "));

    const wordsInCommon = new Set(
      [...actualWords].filter((word) => expectedWords.has(word))
    );

    const allWords = new Set([...actualWords, ...expectedWords]);
    const score = wordsInCommon.size / allWords.size;

    return {
      score,
      label: score > 0.5 ? "similar" : "dissimilar",
      metadata: {
        actualWordsCount: actualWords.size,
        expectedWordsCount: expectedWords.size,
        commonWordsCount: wordsInCommon.size,
        allWordsCount: allWords.size
      },
      explanation: `Jaccard similarity: ${score}`
    };
  }
});

or LLMs.

Python

from phoenix.experiments.evaluators import create_evaluator
from typing import Any, Dict

eval_prompt_template = """
Given the QUESTION and REFERENCE_ANSWER, determine whether the ANSWER is accurate.
Output only a single word (accurate or inaccurate).

QUESTION: {question}

REFERENCE_ANSWER: {reference_answer}

ANSWER: {answer}

ACCURACY (accurate / inaccurate):
"""


@create_evaluator(kind="llm")  # need the decorator or the kind will default to "code"
def accuracy(input: Dict[str, Any], output: str, expected: Dict[str, Any]) -> float:
    message_content = eval_prompt_template.format(
        question=input["question"], reference_answer=expected["answer"], answer=output
    )
    response = openai_client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": message_content}]
    )
    response_message_content = response.choices[0].message.content.lower().strip()
    return 1.0 if response_message_content == "accurate" else 0.0

TypeScript

import { asEvaluator } from "@arizeai/phoenix-client/experiments";
import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

// LLM-based accuracy evaluator
const accuracy = asEvaluator({
  name: "accuracy",
  kind: "LLM",
  evaluate: async ({ input, output, expected }) => {
    const question = String(input.question || "No question provided");
    const referenceAnswer = String(expected?.answer || "No reference answer provided");

    const evalPromptTemplate = `
      Given the QUESTION and REFERENCE_ANSWER, determine whether the ANSWER is accurate.
      Output only a single word (accurate or inaccurate).
      
      QUESTION: {question}
      
      REFERENCE_ANSWER: {reference_answer}
      
      ANSWER: {answer}
      
      ACCURACY (accurate / inaccurate):
    `;

    const messageContent = evalPromptTemplate
      .replace("{question}", question)
      .replace("{reference_answer}", referenceAnswer)
      .replace("{answer}", String(output));

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: [{ role: "user", content: messageContent }]
    });

    const responseContent = 
      response.choices[0]?.message?.content?.toLowerCase().trim() || "";
    const isAccurate = responseContent === "accurate";

    return {
      score: isAccurate ? 1.0 : 0.0,
      label: isAccurate ? "accurate" : "inaccurate",
      metadata: {},
      explanation: `LLM determined the answer is ${isAccurate ? "accurate" : "inaccurate"}`
    };
  }
});

Experiments

Run an experiment and evaluate the results.

Python

from phoenix.experiments import run_experiment

experiment = run_experiment(
    dataset,
    task,
    experiment_name="initial-experiment",
    evaluators=[jaccard_similarity, accuracy],
)

TypeScript

import { runExperiment } from "@arizeai/phoenix-client/experiments";

// Run the experiment with selected evaluators
const experiment = await runExperiment({
  client,
  experimentName: "initial-experiment",
  dataset: { datasetId }, // Use the dataset ID from earlier
  task,
  evaluators: [jaccardSimilarity, accuracy]
});

console.log("Initial experiment completed with ID:", experiment.id);

Run more evaluators after the fact.

Python

from phoenix.experiments import evaluate_experiment

experiment = evaluate_experiment(experiment, evaluators=[contains_keyword, conciseness])

TypeScript

import { evaluateExperiment } from "@arizeai/phoenix-client/experiments";

// Add more evaluations to an existing experiment
const updatedEvaluation = await evaluateExperiment({
  client,
  experiment, // Use the existing experiment object
  evaluators: [containsKeyword, conciseness]
});

console.log("Additional evaluations completed for experiment:", experiment.id);

And iterate 🚀

Dry Run

Sometimes we may want to do a quick sanity check on the task function or the evaluators before unleashing them on the full dataset. Both run_experiment() and evaluate_experiment() accept a dry_run parameter for this purpose: it executes the task and evaluators on a small subset of the dataset without sending any data to the Phoenix server. Setting dry_run=True selects a single sample, and setting it to a number, e.g. dry_run=3, selects that many. The sampling is also deterministic, so you can re-run it repeatedly while debugging.
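For example, a quick dry run of the experiment above on three samples might look like this (nothing is sent to Phoenix):

from phoenix.experiments import run_experiment

# Run the task and both evaluators on 3 deterministically sampled examples
# without uploading any results to the Phoenix server.
run_experiment(
    dataset,
    task,
    experiment_name="dry-run-experiment",
    evaluators=[jaccard_similarity, accuracy],
    dry_run=3,
)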


Sign up for Phoenix:

  1. Sign up for an Arize Phoenix account at https://app.phoenix.arize.com/login

  2. Click Create Space, then follow the prompts to create and launch your space.

Install packages:

pip install arize-phoenix-otel

Set your Phoenix endpoint and API Key:

From your new Phoenix Space

  1. Create your API key from the Settings page

  2. Copy your Hostname from the Settings page

  3. In your code, set your endpoint and API key:

import os

os.environ["PHOENIX_API_KEY"] = "ADD YOUR PHOENIX API KEY"
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "ADD YOUR PHOENIX HOSTNAME"

# If you created your Phoenix Cloud instance before June 24th, 2025,
# you also need to set the API key as a header
#os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={os.getenv('PHOENIX_API_KEY')}"

Having trouble finding your endpoint? Check out Finding your Phoenix Endpoint

  1. Run Phoenix using Docker, local terminal, Kubernetes etc. For more information, see self-hosting.

  2. In your code, set your endpoint:

import os

# Update this with your self-hosted endpoint
os.environ["PHOENIX_COLLECTOR_ENDPOINT] = "http://localhost:6006/v1/traces"

Having trouble finding your endpoint? Check out Finding your Phoenix Endpoint

# .env, or shell environment

# Add Phoenix API Key for tracing
PHOENIX_API_KEY="ADD YOUR PHOENIX API KEY"
# And Collector Endpoint for Phoenix Cloud
PHOENIX_COLLECTOR_ENDPOINT="ADD YOUR PHOENIX HOSTNAME"
# .env, or shell environment

# Collector Endpoint for your self hosted Phoenix, like localhost
PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006"
# (optional) If authentication enabled, add Phoenix API Key for tracing
PHOENIX_API_KEY="ADD YOUR API KEY"
Video: Background + demo on datasets