Files API

The client.files resource lets you upload, retrieve, and manage files used by the VLM Run platform. Files are essential for predictions, fine-tuning, and dataset creation.

Quick Examples

Upload a File

# Upload a file
file = client.files.upload("invoice.jpg")
print(f"File ID: {file.id}, URL: {file.url}")

Retrieve a File

# Get file details
file = client.files.get("file_abc123")
print(f"Name: {file.filename}, Size: {file.bytes} bytes")

List Files

# List your files
files = client.files.list(limit=10)
for file in files:
    print(f"{file.filename} ({file.id})")

Delete a File

# Delete a file
client.files.delete("file_abc123")

File Lifecycle

Files in VLM Run follow a simple lifecycle:

  1. Upload - Send files to the platform
  2. Process - Use files for predictions or other operations
  3. Manage - List, retrieve, or delete files as needed

Uploading Files

Basic Upload

# Upload a file with default settings
file = client.files.upload("document.pdf")

With Purpose

Files can be categorized by purpose, which affects how they can be used:

# Upload for fine-tuning
file = client.files.upload(
    file="training_data.json",
    purpose="fine-tune"
)

From File Object

# Upload from an open file handle
with open("document.pdf", "rb") as f:
    file = client.files.upload(file=f, filename="document.pdf")

Available Purposes

| Purpose | Description | Common File Types |
| --- | --- | --- |
| fine-tune | For fine-tuning models | Training data, JSON files |
| assistants | General usage (default) | Images, PDFs, text files |
| assistants_output | Output from assistants | Generated content |
| batch | Input files for batch processing | Large collections |
| batch_output | Output from batch processing | Results and reports |
| vision | For vision-based models | Images, screenshots |
| datasets | For dataset creation | Labeled data collections |

Retrieving Files

Get File Details

# Get metadata for a specific file
file = client.files.get("file_abc123")

# Access file properties
print(f"Name: {file.filename}")
print(f"Size: {file.bytes} bytes")
print(f"Purpose: {file.purpose}")
print(f"Created: {file.created_at}")

List Files

# List all files (paginated)
files = client.files.list(limit=20)

# Filter by purpose
fine_tune_files = client.files.list(
    purpose="fine-tune",
    limit=10
)

# Pagination
next_page = client.files.list(skip=20, limit=20)

Delete Files

# Delete a file by ID
client.files.delete("file_abc123")

Common Patterns

Upload and Process

The most common pattern is uploading a file and using it immediately:

# 1. Upload the file
file = client.files.upload("invoice.jpg")

# 2. Process with image API
prediction = client.image.generate(
    urls=[file.url],
    domain="document.invoice"
)

# 3. Work with the results
if prediction.status == "completed":
    print(f"Invoice #: {prediction.response.invoice_number}")
    print(f"Amount: ${prediction.response.total_amount}")

Batch Processing Multiple Files

# Upload multiple files
import glob

# Get all PDFs in a directory
pdf_files = glob.glob("./invoices/*.pdf")
results = []

# Process each file
for path in pdf_files:
    # 1. Upload file
    file = client.files.upload(path, purpose="batch")

    # 2. Process document
    prediction = client.document.generate(
        urls=[file.url],
        domain="document.invoice",
        batch=True  # Process asynchronously
    )

    results.append((file.id, prediction.id))

# Later: check results
for file_id, prediction_id in results:
    prediction = client.predictions.get(prediction_id)
    if prediction.status == "completed":
        print(f"File {file_id}: ${prediction.response.total_amount}")

Temporary File Management

Clean up files after use:

# Upload a temporary file
temp_file = client.files.upload("temp_image.jpg")

try:
    # Use the file
    result = client.image.generate(
        urls=[temp_file.url],
        domain="image.classification"
    )

    # Process the result
    print(f"Classification: {result.response}")

finally:
    # Clean up when done
    client.files.delete(temp_file.id)

Optimization Features

File Caching

VLM Run automatically detects duplicate files using content hashing:

# Upload same file twice
file1 = client.files.upload("report.pdf")
file2 = client.files.upload("report.pdf")

# Both operations return the same file ID
print(f"Same file ID: {file1.id == file2.id}")  # True

Pre-upload Caching Check

Check if a file exists before uploading:

# Check if file exists in cache
cached_file = client.files.get_cached_file("large-dataset.zip")

if cached_file:
    file_id = cached_file.id
    print(f"Using existing file: {file_id}")
else:
    file = client.files.upload("large-dataset.zip")
    file_id = file.id
    print(f"Uploaded new file: {file_id}")

Response Structure

The FileResponse object has the following structure:

class FileResponse(BaseModel):
    id: Optional[str]          # Unique file identifier
    filename: Optional[str]    # Original filename
    bytes: int                 # File size in bytes
    purpose: Literal[          # File purpose/category
        "fine-tune",
        "assistants",
        "assistants_output",
        "batch",
        "batch_output",
        "vision",
        "datasets",
    ]
    created_at: datetime       # Creation timestamp
    object: str = "file"       # Object type

Example usage:

file = client.files.get("file_abc123")

print(f"ID: {file.id}")
print(f"Filename: {file.filename}")
print(f"Size: {file.bytes} bytes")
print(f"Purpose: {file.purpose}")
print(f"Created: {file.created_at}")
print(f"Object type: {file.object}")  # Always "file"

Error Handling

from vlmrun.client.exceptions import ApiError, NotFoundError

try:
    file = client.files.upload("document.pdf")
except FileNotFoundError:
    print("Local file not found")
except ApiError as e:
    if e.status_code == 413:
        print("File too large")
    elif e.status_code == 415:
        print("Unsupported file type")
    else:
        print(f"API error: {e.message}")

Best Practices

Use Descriptive Filenames

# ✅ Good: Descriptive name
file = client.files.upload("invoice-march2023-acme-corp.jpg")

# ❌ Bad: Generic name
file = client.files.upload("file.jpg")

Check Cache for Efficiency

# Efficiently upload multiple files
for path in files_to_upload:
    cached = client.files.get_cached_file(path)
    if cached:
        file_id = cached.id  # Use existing file
    else:
        file = client.files.upload(path)
        file_id = file.id

Clean Up Unused Files

# Delete general-purpose ("assistants") files that are more than 30 days old
from datetime import datetime, timedelta

MAX_AGE = timedelta(days=30)
PAGE_SIZE = 20
old_files = []

# File listing is paginated, so walk every page with skip/limit instead of
# inspecting only the first page of results.
skip = 0
while True:
    page = list(client.files.list(skip=skip, limit=PAGE_SIZE))
    if not page:
        break

    for file in page:
        # Build "now" with the same tzinfo as created_at so the subtraction
        # works whether the timestamp is naive or timezone-aware.
        age = datetime.now(tz=file.created_at.tzinfo) - file.created_at
        if age > MAX_AGE and file.purpose == "assistants":
            old_files.append(file.id)

    skip += PAGE_SIZE

# Delete only after the scan is complete so that removals do not shift the
# skip offsets of pages that have not been read yet.
for file_id in old_files:
    client.files.delete(file_id)
    print(f"Deleted old file: {file_id}")

Set Appropriate Timeouts for Large Files

# Extend timeout for large files
large_file = client.files.upload(
    file="large-video.mp4",
    timeout=600  # 10 minutes
)