Batch requests are typically 50% cheaper than making requests individually. However, the requests may take up to a day to process.
Remember: LLM Foundry ALWAYS caches unless you specify Cache-Control: no-cache.
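For example, any request that must bypass the cache can merge the header in explicitly. A minimal sketch, assuming the same token-and-project header convention used in the examples below:

import httpx
import os

headers = {
    "Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project",
    "Cache-Control": "no-cache",  # skip LLM Foundry's response cache
}
# Without Cache-Control: no-cache, repeating an identical request returns the cached response
response = httpx.post(
    "https://llmfoundry.straive.com/openai/v1/chat/completions",
    headers=headers,
    json={"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is 2 + 2?"}]},
)
print(response.json())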
OpenAI batch requests
Here is an example for the OpenAI Batch API.
import json
import httpx
import os
import time
# Create the dataset: a list of OpenAI request bodies with custom IDs
data = [
    {"custom_id": "id1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is 2 + 2?"}]}},
    {"custom_id": "id2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "What is 2 + 3?"}]}},
]
base_url = "https://llmfoundry.straive.com/openai/v1"
headers = {"Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project"}
# Upload the JSON data as a .jsonl (JSON Lines) file
file = httpx.post(
    f"{base_url}/files",
    headers=headers,
    files={"file": ("data.jsonl", "\n".join(json.dumps(item) for item in data), "application/x-jsonlines")},
    data={"purpose": "batch"},
).json()
# Create the batch
batch = httpx.post(
    f"{base_url}/batches",
    headers=headers,
    json={
        "input_file_id": file["id"],
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    },
).json()
# Wait for the batch to complete. This can take minutes to hours (up to the 24-hour window).
while True:
    status = httpx.get(
        f"{base_url}/batches/{batch['id']}",
        headers={"Cache-Control": "no-cache", **headers},
    ).json()
    print(status["status"], status["request_counts"])
    if status["status"] in {"completed", "failed", "cancelled", "expired"}:
        break
    time.sleep(60)
# Download the result
result = httpx.get(f"{base_url}/files/{status['output_file_id']}/content", headers=headers).content
for line in result.splitlines():
    print(json.loads(line))
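If some requests fail, the batch object also carries an error_file_id alongside output_file_id. A short sketch for pulling those errors, assuming the status and headers variables from the polling loop above:

# Download per-request errors, if any (error_file_id is null when all requests succeed)
if status.get("error_file_id"):
    errors = httpx.get(f"{base_url}/files/{status['error_file_id']}/content", headers=headers).content
    for line in errors.splitlines():
        print(json.loads(line))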
Anthropic batch requests
Here is an example for the Anthropic Message Batches API.
import json
import httpx
import os
import time
# Create the dataset: a list of message requests
data = [
    {"custom_id": "req1", "params": {"model": "claude-3-haiku-20240307", "max_tokens": 9, "messages": [{"role": "user", "content": "What is 2 + 2?"}]}},
    {"custom_id": "req2", "params": {"model": "claude-3-haiku-20240307", "max_tokens": 9, "messages": [{"role": "user", "content": "What is 2 + 3?"}]}},
]
base_url = "https://llmfoundry.straive.com/anthropic/v1"
headers = {
    "Authorization": f"Bearer {os.getenv('LLMFOUNDRY_TOKEN')}:my-test-project",
    "Anthropic-Version": "2023-06-01",
    "Anthropic-Beta": "message-batches-2024-09-24",
    "Content-Type": "application/json",
}
# Create the batch
batch = httpx.post(f"{base_url}/messages/batches", headers=headers, json={"requests": data}).json()
# Wait for batch completion (up to 24 hours). The batch is done when
# processing_status becomes "ended".
while True:
    status = httpx.get(
        f"{base_url}/messages/batches/{batch['id']}",
        headers={"Cache-Control": "no-cache", **headers},
    ).json()
    print(status["processing_status"], status["request_counts"])
    if status["processing_status"] == "ended":
        break
    time.sleep(60)
# Download the result
result = httpx.get(status["results_url"], headers=headers).content
for line in result.splitlines():
    print(json.loads(line))
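Each result line pairs a custom_id with a result object whose type is succeeded, errored, canceled, or expired, and results are not guaranteed to arrive in input order. A sketch that collects the successful answers by ID, assuming the result bytes downloaded above:

# Map custom_id -> answer text, skipping errored/canceled/expired entries
answers = {}
for line in result.splitlines():
    item = json.loads(line)
    if item["result"]["type"] == "succeeded":
        # Message content is a list of blocks; the first block holds the text
        answers[item["custom_id"]] = item["result"]["message"]["content"][0]["text"]
print(answers)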
Key features:
- 50% cheaper than standard API pricing
- Supports up to 10,000 requests or 32MB per batch
- Results available for 29 days after creation
- Supports all Claude features (vision, tool use, system messages)
- Available for Claude 3 models (Opus, Sonnet, Haiku)
- Requires the beta header: anthropic-beta: message-batches-2024-09-24
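A batch that is still processing can also be cancelled via the cancel endpoint. A minimal sketch, assuming the batch and headers variables from the example above:

# Request cancellation; requests that already finished still appear in the results
cancelled = httpx.post(f"{base_url}/messages/batches/{batch['id']}/cancel", headers=headers).json()
print(cancelled["processing_status"])  # "canceling" until in-flight requests wind down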
Gemini batch requests
Gemini batch text generation requires uploading files to Google Cloud Storage, which is not supported by LLM Foundry.