# JSON Schema (draft-07) describing AXP experiment definition files, version 2.
# The URL values are quoted so their `#` / `:` characters are never open to
# reinterpretation by a YAML parser.
$schema: 'http://json-schema.org/draft-07/schema#'
$id: 'https://docs.514.ai/schema/experiment.v2.schema.yaml'
title: AXP Experiment v2
description: >
  Machine-readable authoring contract for AXP experiment definitions (v2).
  v2 replaces v1's hand-assembled `matrix.variant[]` with independent axes
  (`agents`, `prompts`, optional `environments` / `products`) that the harness
  cross-multiplies into variants automatically, plus a recursive `extensions`
  mechanism to narrow / redirect / compose combinations. The model is a
  sub-field of the agent, so each `agents` entry pins a valid agent+model pair.
  Only agent / prompt are required (at the top level or via `extensions`). The
  resolved variant set is the cross product `agents × prompts × environments ×
  products`, with `extensions` applied afterwards.
# The document root must be a mapping, and any key that is not declared under
# `properties` is rejected.
type: object
additionalProperties: false
# Non-standard annotation giving the preferred order of the root keys
# (presumably consumed by AXP authoring/serialization tooling — confirm).
# Every entry must name a key declared under the root `properties`: the root
# sets `additionalProperties: false`, so a key listed here without a matching
# declaration could never appear in a valid experiment file.
propertyOrder:
  - schema_version
  - id
  - name
  - description
  - agents
  - prompts
  - environments
  - products
  - extensions
  - environment_variables
  - files
  - tests
  - limits
# Keys every experiment file must carry. `agents` and `prompts` are not listed
# because the root description allows them to be supplied via `extensions`.
required: [schema_version, id, name, tests, limits]
properties:
  # Pinned to the integer 2 by `const`, so only version-2 files validate.
  schema_version:
    description: >-
      Experiment schema version. Version 2 experiments must set this to the
      integer 2.
    type: integer
    const: 2
    examples:
      - 2
  id:
    $ref: "#/definitions/kebabId"
    description: >-
      Stable experiment identifier. This should match the file name without
      the .yaml extension.
    examples:
      - hello-world
      - financial-extraction
  name:
    description: Human-readable experiment label.
    type: string
    minLength: 1
    examples:
      - Financial data extraction across data sources
  # `style: literal` is not a JSON Schema keyword; presumably a serializer hint
  # asking for a `|` block scalar — confirm against the consuming tool.
  description:
    description: >-
      Optional human-readable context AXP can use when analyzing experiment
      outcomes. Omit when unused.
    type: string
    minLength: 1
    style: literal
  # Axis properties. Each one delegates its shape to the same-named entry under
  # `definitions`; the text here explains how the axis takes part in the matrix.
  # NOTE(review): under draft-07 a strict validator may ignore keywords written
  # beside `$ref`. Everything beside `$ref` below is annotation only
  # (description / default / x-axp-*), so validation is unaffected either way.
  agents:
    $ref: "#/definitions/agents"
    description: >
      Agent axis. One agent or a list. Each agent has a `name` selecting the ACP
      adapter binary (claude→ANTHROPIC_API_KEY, codex→OPENAI_API_KEY,
      cursor→CURSOR_API_KEY) and an optional `model` (a model id string or a
      model object). A bare string is sugar for that agent name with the
      provider-default model. Cross-multiplied with the other axes.
  prompts:
    $ref: "#/definitions/prompts"
    description: >
      Prompt axis. One prompt string, a list of strings, or a list of prompt
      objects (`{id, prompt, tags?}`). Each resolved variant runs one prompt;
      its `id` is recorded as a result dimension.
  # Optional axis: not listed in the root `required`.
  environments:
    $ref: "#/definitions/environments"
    description: >
      Optional environment axis. Each environment describes how the agent's
      world is prepared via `setup`(s) run inside the variant sandbox before
      setup_checks and the agent. Cross-multiplied with the other axes; when
      absent, variants have no environment/setup unless an `extensions` entry adds one.
  # Optional axis: not listed in the root `required`.
  products:
    $ref: "#/definitions/products"
    description: >
      Optional product axis: the agent-facing surface under test (CLI / MCP /
      API / …). Like environments, each product contributes `setup`(s); products
      also carry a `type`, `version`, and `commit`. When present, products are
      cross-multiplied with the other axes.
  # Tree of refinements applied on top of the axes; `x-axp-resolution` is a
  # custom annotation summarizing how the harness resolves them.
  extensions:
    $ref: "#/definitions/extensions"
    description: >
      Optional recursive refinements cross-multiplied against the base matrix:
      narrow the axes to a slice, redirect with different axis values, or append
      a prompt to a slice. When any `extensions` entry is declared, only
      extension-derived variants are emitted.
    default: []
    x-axp-resolution: >
      Extension prompts are appended to the base prompt with a blank-line
      separator (or stand alone when there is no top-level prompt); redirected
      agents/environments/products replace that axis for the subtree. Nested
      `extensions` compose. Full algebra is in resolve.rs.
  # Inputs shared by every variant, followed by the two mandatory sections
  # (`tests`, `limits`) whose shapes live under `definitions`.
  environment_variables:
    $ref: "#/definitions/environmentVariables"
    description: >-
      Environment variables to inject into every variant at runtime. Each
      value is a literal or an axp://secrets/<slug> reference resolved from
      the organization's secret store; any other axp:// value is rejected.
    default: []
  files:
    $ref: "#/definitions/files"
    description: >-
      Local host files or directories staged into every variant's /workspace
      over the bridge HTTP API before any `setup` runs. The path for local,
      uncommitted builds (CLIs, MCPs, fixtures) to enter a sandbox. A staging
      failure aborts the variant with status=error / exit_reason=staging_failed.
    default: []
  tests:
    $ref: "#/definitions/tests"
  limits:
    $ref: "#/definitions/limits"
examples:
  # One agent, one prompt, two environments; scored by one application test.
  - schema_version: 2
    id: financial-extraction
    name: Financial data extraction across data sources
    prompts:
      - id: aapl
        prompt: |
          You are running one cell of a financial-data extraction benchmark.
          Produce /workspace/answer.json for AAPL (Apple Inc.)'s latest
          company-reported fiscal period.
    agents:
      - name: claude
        model: anthropic/claude-sonnet-4.6
    environments:
      # Bare-string `setup`: a single bash command per environment.
      - name: sec-edgar
        setup: 'pip install --quiet edgartools'
      - name: financialdatasets-rest
        setup: 'pip install --quiet financialdatasets-client'
    tests:
      application:
        - name: answer-is-valid-json
          script: 'jq -e . /workspace/answer.json'
    limits:
      max_turns: 50
      max_time_seconds: 900
      max_cost_usd: 1.0
  # Two agents, two prompts, two environments; scored by one introspection test.
  - schema_version: 2
    id: report-build-matrix
    name: Report build across agents and prompts
    description: agent (2) × prompt (2) × environment (2) = 8 variants.
    agents:
      - name: claude
        model: anthropic/claude-opus-4.8
      - name: codex
        model: openai/gpt-5
    prompts:
      - id: terse
        prompt: Build the report.
      - id: detailed
        prompt: Build the report. Start by enumerating the data you need.
    environments:
      - name: financialdatasets
        setup: ./install-financialdatasets-mcp.sh
        tags:
          - data-source
      - name: sec
        setup: ./install-sec-fixtures.sh
        tags:
          - data-source
    tests:
      introspection:
        - name: under-thirty-tool-calls
          # Literal block (strip chomping) so the shell quoting needs no YAML escaping.
          script: |-
            [ "$(jq ".tool_calls | length" "$AXP_TRACE_PATH")" -lt 30 ]
    limits:
      max_turns: 40
      max_time_seconds: 1200
      max_cost_usd: 2.0
definitions:
  # Lower-case identifier made of letters, digits and hyphens that begins and
  # ends with a letter or digit. The pattern is quoted because it is a regex.
  kebabId:
    type: string
    minLength: 1
    pattern: '^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$'
  # List of non-empty label strings; reused by prompts, setups, environments,
  # products and extensions.
  tags:
    description: Free-form labels for grouping/filtering resolved variants in Results.
    type: array
    items:
      type: string
      minLength: 1
    examples:
      - - data-source
      - - "ticker:apple"
        - "target:sec"
  # Upper-case identifier: first character is A-Z or `_`, the rest may also
  # contain digits.
  secretName:
    type: string
    minLength: 1
    pattern: '^[A-Z_][A-Z0-9_]*$'
  # One name/value pair injected into the sandbox. Both keys are mandatory and
  # no other keys are accepted.
  environmentVariable:
    title: Environment Variable
    type: object
    additionalProperties: false
    propertyOrder: [name, value]
    required: [name, value]
    properties:
      name:
        $ref: "#/definitions/secretName"
        # The validator is referred to by its full command name, matching every
        # other mention in this schema.
        description: >-
          Env-var name to set in the sandbox. Harness-reserved names and
          prefixes are rejected by axp experiment validate.
      value:
        type: string
        description: >-
          Literal value or an axp://secrets/<slug> store reference. The axp://
          prefix is reserved; any other axp:// value is rejected.
  # List of environmentVariable entries; `uniqueItems` rejects exact duplicates.
  environmentVariables:
    type: array
    items:
      $ref: "#/definitions/environmentVariable"
    uniqueItems: true
    examples:
      -
        - name: LOG_LEVEL
          value: debug
  # Closed set of supported agent harness names.
  agentName:
    description: Coding agent harness the driver invokes.
    type: string
    enum:
      - claude
      - codex
      - cursor
  model:
    title: Model
    description: >
      The model an agent runs. With only `name`, the adapter uses provider
      defaults. `effort` / `context_window_size` / `thinking` / `fast` are
      optional controls passed to the agent (each adapter honors what it
      supports) and recorded as result dimensions.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - effort
      - context_window_size
      - thinking
      - fast
    # Only the model identifier is mandatory; every control below is optional.
    required:
      - name
    properties:
      name:
        type: string
        minLength: 1
        description: >-
          Canonical provider/model identifier (e.g. anthropic/claude-opus-4.8,
          openai/gpt-5).
        examples:
          - anthropic/claude-opus-4.8
          - openai/gpt-5
      effort:
        type: string
        enum:
          - low
          - medium
          - high
          - x-high
          - max
        description: Optional reasoning-effort control.
      context_window_size:
        # Typed as a string, so suffixed sizes such as "1M" are valid values.
        type: string
        minLength: 1
        description: Optional context-window size hint (e.g. 1M).
        examples:
          - "1M"
          - "200k"
      thinking:
        type: boolean
        description: Enable thinking mode (if the model supports it).
      fast:
        type: boolean
        description: Enable fast mode (if the model supports it).
  # Either shorthand (a non-empty id string) or the full `model` object; the two
  # alternatives differ in JSON type, so exactly one can match.
  modelRef:
    description: A model id string, or a model object.
    oneOf:
      - { type: string, minLength: 1 }
      - $ref: "#/definitions/model"
  # Object form of an agent: `name` is mandatory, `model` is optional.
  agent:
    title: Agent
    description: An agent harness `name` plus the `model` it runs.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - model
    required:
      - name
    properties:
      name:
        $ref: "#/definitions/agentName"
      model:
        $ref: "#/definitions/modelRef"
  agents:
    description: >
      Agent axis. A bare string is sugar for that agent name with the
      provider-default model; otherwise an agent object, or a list mixing
      strings and objects.
    # Three accepted shapes: one name, one object, or a non-empty list whose
    # items are each a name or an object.
    oneOf:
      - $ref: "#/definitions/agentName"
      - $ref: "#/definitions/agent"
      - type: array
        minItems: 1
        items:
          oneOf:
            - $ref: "#/definitions/agentName"
            - $ref: "#/definitions/agent"
    examples:
      - claude
      - - claude
        - codex
      - - name: claude
          model: anthropic/claude-opus-4.8
  # Object form of a prompt: `id` and `prompt` are mandatory; `description` and
  # `tags` are optional.
  prompt:
    title: Prompt
    type: object
    additionalProperties: false
    propertyOrder:
      - id
      - prompt
      - description
      - tags
    required:
      - id
      - prompt
    properties:
      id:
        $ref: "#/definitions/kebabId"
        description: Stable prompt identifier; recorded as a result dimension.
        examples:
          - aapl
          - terse
      prompt:
        description: The task text given to the agent for this prompt.
        type: string
        minLength: 1
        style: literal
      description:
        type: string
        minLength: 1
        style: literal
      tags:
        $ref: "#/definitions/tags"
  # Unlike the other axes, a single bare prompt *object* is not an accepted
  # shape here: a prompt object must appear inside a list.
  prompts:
    description: >
      Prompt axis. A single prompt string, or a list mixing prompt strings and
      prompt objects (`{id, prompt, description?, tags?}`).
    oneOf:
      - type: string
        minLength: 1
      - type: array
        minItems: 1
        items:
          oneOf:
            - type: string
              minLength: 1
            - $ref: "#/definitions/prompt"
  setupObject:
    title: Setup
    description: >
      A first-class, reusable setup step. `script` is bash run inside the variant
      sandbox before setup_checks and the agent. A setup additionally owns the
      `files`, `environment_variables`, `mcp_servers`, and `setup_checks` it needs, so the same
      setup carries its dependencies wherever it is referenced.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - script
      - description
      - tags
      - files
      - environment_variables
      - mcp_servers
      - setup_checks
    # A setup must be named and must carry a script; the dependency lists
    # below are optional and each defaults to an empty list.
    required:
      - name
      - script
    properties:
      name:
        $ref: "#/definitions/kebabId"
        examples:
          - python-lib
          - mcp
      script:
        description: >-
          Bash run inside this variant's sandbox before setup_checks and the
          agent. Secret values are NOT injected here.
        type: string
        minLength: 1
        style: literal
      description:
        type: string
        minLength: 1
        style: literal
      tags:
        $ref: "#/definitions/tags"
      files:
        $ref: "#/definitions/files"
        default: []
      environment_variables:
        $ref: "#/definitions/environmentVariables"
        default: []
      mcp_servers:
        $ref: "#/definitions/mcpServers"
        default: []
      setup_checks:
        $ref: "#/definitions/setupChecks"
        default: []
  setup:
    description: >
      Setup(s) for an environment or product: a bash string, a list of bash
      strings (run sequentially), a single setup object, or a list of setup
      objects. Multiple setups run in declaration order.
    # Three accepted shapes: one bash string, one setup object, or a non-empty
    # list whose items are each a bash string or a setup object.
    oneOf:
      - { type: string, minLength: 1 }
      - $ref: "#/definitions/setupObject"
      - type: array
        minItems: 1
        items:
          oneOf:
            - { type: string, minLength: 1 }
            - $ref: "#/definitions/setupObject"
  # Object form of an environment: `name` and `setup` are mandatory; the
  # remaining keys are optional metadata.
  environment:
    title: Environment
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - setup
      - description
      - tags
      - commit
    required:
      - name
      - setup
    properties:
      name:
        $ref: "#/definitions/kebabId"
        description: >-
          Stable environment name (kebab-case); recorded as a result dimension
          and used as the environment coordinate.
        examples:
          - sec-edgar
          - financialdatasets-rest
      setup:
        $ref: "#/definitions/setup"
      description:
        type: string
        minLength: 1
        style: literal
      tags:
        $ref: "#/definitions/tags"
      commit:
        type: string
        minLength: 1
        description: Optional source commit (git SHA) of the environment under test.
  environments:
    description: >
      Environment axis. A bare string is sugar for an environment whose `setup`
      is that string (positional name `e{idx}`); otherwise an environment object,
      or a list mixing strings and objects.
    # Three accepted shapes: one setup string, one environment object, or a
    # non-empty list whose items are each a string or an object.
    oneOf:
      - { type: string, minLength: 1 }
      - $ref: "#/definitions/environment"
      - type: array
        minItems: 1
        items:
          oneOf:
            - { type: string, minLength: 1 }
            - $ref: "#/definitions/environment"
  # Closed set of product kinds; values are case-sensitive as written.
  productType:
    description: What kind of agent-facing surface this product is.
    type: string
    enum:
      - CLI
      - MCP
      - API
      - Skill
      - SDK
      - Schema
      - Docs
      - Marketing
      - Agents.md
      - Other
  product:
    title: Product
    description: The agent-facing surface under test (CLI / MCP / API / …).
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - type
      - setup
      - version
      - commit
      - description
      - tags
    # Only `name` and `setup` are mandatory; `type` is optional and documented
    # with the default `Other`.
    required:
      - name
      - setup
    properties:
      name:
        $ref: "#/definitions/kebabId"
        description: >-
          Stable product name (kebab-case); recorded as a result dimension and
          used as the product coordinate.
        examples:
          - clickhouse-cli
      type:
        $ref: "#/definitions/productType"
        default: Other
      setup:
        $ref: "#/definitions/setup"
      version:
        description: >
          Optional product version under test (semver or build number). String
          only — quote numeric versions, e.g. "25" or "25.3".
        type: string
        minLength: 1
        # Quoted so YAML keeps them as strings rather than numbers.
        examples:
          - "25.3"
          - "1.2.0"
      commit:
        type: string
        minLength: 1
        description: Optional source commit (git SHA) of the product under test.
      description:
        type: string
        minLength: 1
        style: literal
      tags:
        $ref: "#/definitions/tags"
  products:
    description: >
      Product axis. A bare string is sugar for a product whose `setup` is that
      string (positional name `pr{idx}`); otherwise a product object, or a list
      mixing strings and objects.
    # Three accepted shapes: one setup string, one product object, or a
    # non-empty list whose items are each a string or an object.
    oneOf:
      - { type: string, minLength: 1 }
      - $ref: "#/definitions/product"
      - type: array
        minItems: 1
        items:
          oneOf:
            - { type: string, minLength: 1 }
            - $ref: "#/definitions/product"
  # Root-level list of extension nodes. No `minItems` here, so an empty list is
  # valid (the nested `extensions` inside an extension item does require one).
  extensions:
    type: array
    items:
      $ref: "#/definitions/extensionItem"
    default: []
  extensionItem:
    title: Extension
    description: >
      A recursive refinement of the cross product. Each extension is a node in a
      tree; walking root→leaf redirects the axes it specifies (agents,
      environments, products replace that axis for the subtree), appends its
      `prompts` text as a suffix to every base prompt (or, with no top-level
      prompt, the suffix stands alone under a synthesized promptId), and unions
      its tags. A leaf extension emits the cross product of its accumulated axes.
      When an experiment declares any `extensions`, only extension-derived
      variants are emitted. `axp experiment validate` requires extension ids to be unique
      within each `extensions` list and the resolved set to be non-empty with
      unique variant ids.
    type: object
    additionalProperties: false
    propertyOrder:
      - id
      - description
      - tags
      - agents
      - prompts
      - environments
      - products
      - extensions
    # Only `id` is mandatory; every axis override is optional.
    required:
      - id
    properties:
      id:
        $ref: "#/definitions/kebabId"
        examples:
          - financialdatasets-rest
          - high-complexity
      description:
        type: string
        minLength: 1
        style: literal
      tags:
        $ref: "#/definitions/tags"
      agents:
        $ref: "#/definitions/agents"
      prompts:
        description: >
          Prompt suffix text for this extension. All entries' text is appended
          (joined by a blank line) to every inherited base prompt; entry ids are
          ignored (the base promptId is preserved). With no top-level prompt the
          suffix stands alone as the variant's prompt under a synthesized id.
        $ref: "#/definitions/prompts"
      environments:
        $ref: "#/definitions/environments"
      products:
        $ref: "#/definitions/products"
      extensions:
        # Self-reference makes the structure a tree; when present, a nested
        # list must contain at least one child.
        type: array
        minItems: 1
        items:
          $ref: "#/definitions/extensionItem"
  # Both lists are optional at the schema level (`required` is empty); the
  # "at least one test" rule is stated as belonging to the CLI validator.
  tests:
    title: Tests
    description: >-
      Bash scripts that score application state and/or agent behavior. At
      least one test across both lists is required by axp experiment validate.
    type: object
    additionalProperties: false
    propertyOrder:
      - application
      - introspection
    required: []
    properties:
      application:
        description: >-
          Tests that inspect resulting application state, files, commands, or
          endpoints.
        type: array
        default: []
        items:
          $ref: "#/definitions/test"
      introspection:
        description: >-
          Tests that inspect agent behavior through trace artifacts such as
          AXP_TRACE_PATH.
        type: array
        default: []
        items:
          $ref: "#/definitions/test"
    x-axp-canonical-validator: >-
      axp experiment validate enforces at least one total test and globally
      unique test names.
  # A single scoring script: both `name` and `script` are mandatory.
  test:
    title: Test
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - script
    required:
      - name
      - script
    properties:
      name:
        $ref: "#/definitions/kebabId"
        description: Stable globally unique test name within the experiment.
        examples:
          - dev-server-healthy
          - report-is-valid-json
      script:
        description: >-
          Bash script executed by the harness. Test scripts are streamed over
          stdin and are not shown to the agent.
        type: string
        minLength: 1
        style: literal
        examples:
          - 'curl -fsS http://localhost:4000/health'
  # List of staging entries; each item follows `fileStage`.
  files:
    type: array
    items:
      $ref: "#/definitions/fileStage"
    default: []
    examples:
      -
        - name: my-cli
          source: ../build/mycli
          dest: tools/mycli
  fileStage:
    title: FileStage
    description: >
      One host→workspace staging entry. The harness tars `source` on the host
      and extracts it under `dest` (workspace-relative) inside the variant
      sandbox before `setup` runs. A directory source extracts its contents
      under `dest/`; a file source lands as a single file at `dest`.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - source
      - sha256
      - dest
    # `dest` is always mandatory, and `anyOf` additionally demands at least one
    # of `source` / `name` (both together is also valid).
    required:
      - dest
    anyOf:
      - required:
          - source
      - required:
          - name
    properties:
      name:
        $ref: "#/definitions/kebabId"
        description: >-
          Stable handle for `axp local run --file NAME=HOSTPATH`. Required when
          `source` is omitted, optional otherwise.
        examples:
          - my-cli
      source:
        type: string
        minLength: 1
        description: >-
          Host path to stage, or an `http://` / `https://` URL to a publicly
          fetchable artifact. Relative host paths resolve against the experiment
          YAML's directory.
        examples:
          - ../build/mycli
          - "https://example.com/fixtures/data.bin"
      sha256:
        type: string
        # Exactly 64 hexadecimal characters, upper or lower case.
        pattern: '^[0-9a-fA-F]{64}$'
        description: >-
          Optional SHA-256 (64 hex chars) of the artifact. The runner hashes the
          staged bytes and fails staging on mismatch. A directory source with a
          `sha256` is rejected.
        examples:
          - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
      dest:
        type: string
        minLength: 1
        description: >-
          Workspace-relative destination. Absolute paths, `..`, `.axp-bridge`
          components, and `::` are rejected.
        examples:
          - tools/mycli
  # List of preflight checks; each item follows `setupCheck`.
  setupChecks:
    type: array
    items:
      $ref: "#/definitions/setupCheck"
    default: []
  # A single preflight check: both `name` and `script` are mandatory.
  setupCheck:
    title: SetupCheck
    description: >-
      A preflight bash check that runs inside the variant container after
      `setup` but before the agent starts. Non-zero exit aborts the variant.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - script
    required:
      - name
      - script
    properties:
      name:
        $ref: "#/definitions/kebabId"
        description: Stable name; appears in `setup-checks/<name>.json`.
        examples:
          - rust-installed
          - cli-on-path
      script:
        description: >-
          Bash check streamed over stdin; not visible to the agent. Has access
          to declared secrets at runtime.
        type: string
        minLength: 1
        style: literal
        examples:
          - 'rustc --version'
  # List of MCP server configurations; each item follows `mcpServer`.
  mcpServers:
    type: array
    items:
      $ref: "#/definitions/mcpServer"
    default: []
    examples:
      # A stdio server launched from a workspace script.
      -
        - name: fixture-sentinel
          type: stdio
          command: /workspace/fixture-mcp.py
          args: []
      # An http server reached by URL.
      -
        - name: axp
          type: http
          url: http://localhost:3001/mcp
  mcpServer:
    title: MCP Server
    description: >
      MCP server configuration passed through ACP session/new, declared on a
      Setup. `type: stdio` requires `command` (+ optional `args` / `env`);
      `type: http` / `sse` require `url` (+ optional `headers`). Stdio `env`
      entries reference experiment-declared secret names; `headers` values may
      contain `${SECRET_NAME}` placeholders. Transport-specific requirements are
      enforced by `axp experiment validate`.
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - type
      - command
      - args
      - url
      - env
      - headers
    # Only `name` and `type` are required by this schema; `command` / `url` are
    # declared optional here and their per-transport requirement is described
    # above as a validator-side rule.
    required:
      - name
      - type
    properties:
      name:
        type: string
        minLength: 1
        description: >-
          Human-readable MCP server name. Must be unique within a setup's
          mcp_servers.
        examples:
          - fixture-sentinel
          - axp
      type:
        type: string
        enum:
          - http
          - stdio
          - sse
        description: >-
          MCP transport. stdio spawns a local process; http/sse connect to a
          URL.
      command:
        type: string
        minLength: 1
        description: >-
          Required for type=stdio. Absolute path or executable name for the MCP
          server process inside the sandbox.
        examples:
          - /workspace/fixture-mcp.py
      args:
        type: array
        default: []
        items:
          type: string
        description: Command-line arguments passed to the stdio MCP server.
        examples:
          - - "--mode"
            - test
      url:
        type: string
        minLength: 1
        description: Required for type=http/sse. MCP endpoint URL.
        examples:
          - http://localhost:3001/mcp
      env:
        type: array
        default: []
        description: >
          Secret env vars forwarded to a stdio MCP server process. Each entry is
          either a bare secret name (sugar for `{name: NAME, from: NAME}`) or an
          explicit `{name, from}` mapping where `from` references a declared
          secret. Values are resolved at runtime; only names appear in YAML.
        items:
          $ref: "#/definitions/mcpServerStdioEnv"
        examples:
          # Bare-name shorthand only.
          - - GITHUB_TOKEN
          # Shorthand mixed with an explicit rename mapping.
          - - GITHUB_TOKEN
            - name: GH_AUTH
              from: GITHUB_TOKEN
      headers:
        type: array
        default: []
        description: >
          HTTP headers sent with every request to an http/sse endpoint. Header
          `value` may contain `${SECRET_NAME}` placeholders expanded at runtime
          from the variant's resolved `secrets` set.
        items:
          $ref: "#/definitions/mcpServerHeader"
        examples:
          - - name: Authorization
              value: "Bearer ${SUPABASE_SERVICE_ROLE_KEY}"
  # Either a bare secret name (string) or an explicit `{name, from}` mapping;
  # the two alternatives differ in JSON type, so exactly one can match.
  mcpServerStdioEnv:
    title: Stdio MCP Server Env Entry
    oneOf:
      - $ref: "#/definitions/secretName"
      - type: object
        additionalProperties: false
        propertyOrder:
          - name
          - from
        required:
          - name
          - from
        properties:
          name:
            type: string
            minLength: 1
            description: Env var name as seen by the MCP server process.
            examples:
              - GH_AUTH
          from:
            $ref: "#/definitions/secretName"
            description: Experiment-declared secret name whose value is forwarded.
            examples:
              - GITHUB_TOKEN
  # One HTTP header: both `name` and `value` are mandatory, non-empty strings.
  mcpServerHeader:
    title: HTTP/SSE MCP Server Header
    type: object
    additionalProperties: false
    propertyOrder:
      - name
      - value
    required:
      - name
      - value
    properties:
      name:
        type: string
        minLength: 1
        description: HTTP header name.
        examples:
          - Authorization
      value:
        type: string
        minLength: 1
        description: >-
          Header value. May contain `${SECRET_NAME}` placeholders that resolve
          at runtime.
        examples:
          - "Bearer ${SUPABASE_SERVICE_ROLE_KEY}"
  # Run budget. All three caps are mandatory and must be strictly positive
  # (`exclusiveMinimum: 0`); the turn and time caps must also be integers.
  limits:
    title: Limits
    type: object
    additionalProperties: false
    propertyOrder:
      - max_turns
      - max_time_seconds
      - max_cost_usd
    required:
      - max_turns
      - max_time_seconds
      - max_cost_usd
    properties:
      max_turns:
        description: Maximum agent turns. Passed to the agent command where supported.
        type: integer
        exclusiveMinimum: 0
        examples:
          - 50
      max_time_seconds:
        description: Wall-clock timeout in seconds enforced by the harness.
        type: integer
        exclusiveMinimum: 0
        examples:
          - 900
      max_cost_usd:
        description: >-
          Positive cost cap in USD. When the agent reports cumulative cost
          during a run and the reported cost reaches this value, the harness
          aborts the agent and marks the run cost_cap.
        type: number
        exclusiveMinimum: 0
        examples:
          - 1.0
