Batch requests are typically 50% cheaper than making requests individually. However, the requests may take up to a day to process.

Remember: LLM Foundry ALWAYS caches unless you specify Cache-Control: no-cache.
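This matters when polling batch status: without the header, a poll may return a cached (stale) response. A minimal sketch of a cache-bypassing GET, assuming the proxy forwards OpenAI's /v1/models endpoint (the polling loops below use the same header):

import httpx
import os

headers = {"Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project"}

# Cache-Control: no-cache bypasses LLM Foundry's cache and fetches a fresh response
models = httpx.get(
    "https://llmfoundry.straive.com/openai/v1/models",
    headers={"Cache-Control": "no-cache", **headers},
).json()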

OpenAI batch requests

Here is an example for the OpenAI Batch API.

import json
import httpx
import os
import time

# Create the dataset: a list of OpenAI request bodies with custom IDs
data = [
    {"custom_id": "id1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is 2 + 2?"}]}},
    {"custom_id": "id2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is 2 + 3?"}]}}
]

base_url = "https://llmfoundry.straive.com/openai/v1"
headers = {"Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project"}

# Upload the JSON data as a .jsonl (JSON lines) file
file = httpx.post(
    f"{base_url}/files",
    headers=headers,
    files={"file": ("data.jsonl", "\n".join(json.dumps(item) for item in data), "application/x-jsonlines")},
    data={"purpose": "batch"}
).json()
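The response includes the file's id, which the batch creation step below references. To confirm the upload first, a minimal sketch, assuming the proxy forwards OpenAI's file-retrieval endpoint:

# Retrieve the file's metadata to verify the upload
uploaded = httpx.get(f"{base_url}/files/{file['id']}", headers=headers).json()
print(uploaded["filename"], uploaded["bytes"], uploaded["purpose"])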

# Create the batch
batch = httpx.post(
    f"{base_url}/batches",
    headers=headers,
    json={
      "input_file_id": file["id"],
      "endpoint": "/v1/chat/completions",
      "completion_window": "24h"
    }
).json()
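If you need to abandon a batch before it finishes, OpenAI exposes a cancel endpoint; a sketch, assuming the proxy forwards it:

# Cancel a running batch; its status moves through "cancelling" to "cancelled"
httpx.post(f"{base_url}/batches/{batch['id']}/cancel", headers=headers)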

# Poll until the batch reaches a terminal state. This can take minutes to hours (up to the 24h completion window).
while True:
    status = httpx.get(
        f"{base_url}/batches/{batch['id']}",
        headers={"Cache-Control": "no-cache", **headers},
    ).json()
    print(status["status"], status["request_counts"])
    if status["status"] in {"completed", "failed", "cancelled", "expired"}:
        break
    time.sleep(60)
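A batch can finish as "completed" and still contain per-request failures; those are written to a separate error file. A minimal sketch for inspecting it when present:

# Failed requests, if any, land in an error file alongside the output file
if status.get("error_file_id"):
    errors = httpx.get(f"{base_url}/files/{status['error_file_id']}/content", headers=headers).content
    for line in errors.splitlines():
        print(json.loads(line))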

# Download the result
result = httpx.get(f"{base_url}/files/{status['output_file_id']}/content", headers=headers).content
for line in result.splitlines():
    print(json.loads(line))
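Output lines are not guaranteed to follow input order, so match them back to your requests via custom_id. A minimal sketch:

# Index results by custom_id, then pull out each model reply
results = {}
for line in result.splitlines():
    entry = json.loads(line)
    results[entry["custom_id"]] = entry
print(results["id1"]["response"]["body"]["choices"][0]["message"]["content"])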

Anthropic batch requests

Here is an example for the Anthropic Message Batches API.

import json
import httpx
import os
import time

# Create the dataset: a list of message requests
data = [
    {"custom_id": "req1", "params": {"model": "claude-3-haiku-20240307", "max_tokens": 9, "messages": [{"role": "user", "content": "What is 2 + 2?"}]}},
    {"custom_id": "req2", "params": {"model": "claude-3-haiku-20240307", "max_tokens": 9, "messages": [{"role": "user", "content": "What is 2 + 3?"}]}}
]

base_url = "https://llmfoundry.straive.com/anthropic/v1"
headers = {
    "Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project",
    "Anthropic-Version": "2023-06-01",
    "Anthropic-Beta": "message-batches-2024-09-24",
    "Content-Type": "application/json",
}

# Create the batch
batch = httpx.post(f"{base_url}/messages/batches", headers=headers, json={"requests": data}).json()
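The response is a message batch object whose processing_status starts at "in_progress". To stop a batch early, Anthropic exposes a cancel endpoint; a sketch, assuming the proxy forwards it:

# Cancel a running batch; processing_status moves through "canceling" to "ended"
httpx.post(f"{base_url}/messages/batches/{batch['id']}/cancel", headers=headers)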

# Wait for batch completion (up to 24 hours)
while True:
    status = httpx.get(
        f"{base_url}/messages/batches/{batch['id']}",
        headers={"Cache-Control": "no-cache", **headers},
    ).json()
    print(status["processing_status"], status["request_counts"])
    if "result_type" in status:
        break
    time.sleep(60)

# Download the result
result = httpx.get(status["results_url"], headers=headers).content
for line in result.splitlines():
    print(json.loads(line))
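Each result line pairs a custom_id with a result whose type is succeeded, errored, canceled, or expired, and results may arrive in any order. A minimal sketch for extracting the text of successful responses:

# Separate successes from failures and print the model's text
for line in result.splitlines():
    entry = json.loads(line)
    if entry["result"]["type"] == "succeeded":
        print(entry["custom_id"], entry["result"]["message"]["content"][0]["text"])
    else:
        print(entry["custom_id"], "ended with type:", entry["result"]["type"])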

Key features:

  • 50% cheaper than standard API pricing
  • Supports up to 10,000 requests or 32MB per batch
  • Results available for 29 days after creation
  • Supports all Claude features (vision, tool use, system messages); see the sketch after this list
  • Available for Claude 3 models (Opus, Sonnet, Haiku)
  • Requires beta header: anthropic-beta: message-batches-2024-09-24
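Because each entry's params is a complete Messages API request body, these features slot in per request. A hedged sketch of an entry with a system prompt (req3 and the prompt text are illustrative):

# A batch entry carrying a system prompt alongside the user message
entry = {
    "custom_id": "req3",
    "params": {
        "model": "claude-3-haiku-20240307",
        "max_tokens": 100,
        "system": "Answer in one word.",
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
    },
}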

Gemini batch requests

Gemini batch text generation requires uploading files to Google Cloud Storage, which is not supported by LLM Foundry.