This guide shows how to load-test local Ollama AI models. It is well suited to testing a self-hosted AI setup and to comparing local and cloud AI performance.

Use Cases

  • Test local Ollama model performance
  • Compare different local AI models
  • Validate self-hosted AI reliability
  • Measure local AI response times

Simple Implementation

from locust import task, HttpUser
import json
import random

class OllamaUser(HttpUser):
    """Locust user that exercises the HTTP API of a local Ollama server.

    Tasks (relative weights in parentheses):
      * generate_text   (3) - POST /api/generate
      * chat_completion (2) - POST /api/chat
      * list_models     (1) - GET  /api/tags
      * model_info      (1) - POST /api/show

    Every request is issued with ``catch_response=True`` so that the task
    itself decides whether the request counts as a success or a failure.
    """

    # Default target so the script runs without --host; Locust's --host
    # option (or the web UI host field) overrides this value.
    host = "http://localhost:11434"

    def on_start(self):
        """Prepare the base URL, the candidate models and the test prompts."""
        # Ollama typically runs on localhost:11434
        self.base_url = (self.host or "http://localhost:11434").rstrip("/")

        # Available models (install with: ollama pull model-name)
        self.models = [
            "llama2:7b",
            "mistral:7b",
            "codellama:7b",
            "phi:2.7b",
        ]

        # Test prompts
        self.prompts = [
            "Write a short product description for a smartphone.",
            "Explain machine learning in simple terms.",
            "Create a brief email to schedule a meeting.",
            "Write a Python function to calculate fibonacci numbers.",
            "Describe the benefits of renewable energy.",
        ]

    def _parse_json(self, response):
        """Return the response body as a dict, or None after marking a failure.

        ``ValueError`` is caught rather than ``json.JSONDecodeError`` because
        the decode error raised by ``response.json()`` is only guaranteed to
        be a ``ValueError`` subclass (it differs when simplejson is installed).
        """
        try:
            data = response.json()
        except ValueError:
            response.failure("Invalid JSON response from Ollama")
            return None
        # The tasks below call .get() on the payload, so anything that is
        # not a JSON object is treated as an invalid response as well.
        if not isinstance(data, dict):
            response.failure("Invalid JSON response from Ollama")
            return None
        return data

    @task(3)
    def generate_text(self):
        """Generate text using Ollama"""
        model = random.choice(self.models)
        prompt = random.choice(self.prompts)

        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.7,
                "num_predict": 100,
            },
        }

        # catch_response=True is required for response.failure() to exist.
        with self.client.post(
            f"{self.base_url}/api/generate",
            json=payload,
            name=f"Generate - {model}",
            catch_response=True,
        ) as response:
            if response.status_code != 200:
                response.failure(f"Ollama error: {response.status_code}")
                return

            data = self._parse_json(response)
            if data is None:
                return

            response_text = data.get("response", "")
            if data.get("done", False) and response_text:
                print(f"{model}: Generated {len(response_text)} characters")
            else:
                response.failure("Incomplete response from Ollama")

    @task(2)
    def chat_completion(self):
        """Test chat completion with Ollama"""
        model = random.choice(self.models)
        prompt = random.choice(self.prompts)

        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "stream": False,
            "options": {
                "temperature": 0.7,
                "num_predict": 100,
            },
        }

        with self.client.post(
            f"{self.base_url}/api/chat",
            json=payload,
            name=f"Chat - {model}",
            catch_response=True,
        ) as response:
            if response.status_code != 200:
                response.failure(f"Ollama chat error: {response.status_code}")
                return

            data = self._parse_json(response)
            if data is None:
                return

            # "or {}" also covers an explicit null message in the payload.
            message = data.get("message") or {}
            content = message.get("content", "")
            if data.get("done", False) and content:
                print(f"{model} Chat: {len(content)} characters")
            else:
                response.failure("Incomplete chat response")

    @task(1)
    def list_models(self):
        """List available models"""
        with self.client.get(
            f"{self.base_url}/api/tags",
            name="List Models",
            catch_response=True,
        ) as response:
            if response.status_code != 200:
                response.failure(f"Failed to list models: {response.status_code}")
                return

            data = self._parse_json(response)
            if data is None:
                return

            models = data.get("models", [])
            print(f"Available models: {len(models)}")

            for model in models[:3]:  # Show first 3
                name = model.get("name", "Unknown")
                size = model.get("size", 0) / (1024**3)  # Convert to GB
                print(f"  - {name}: {size:.1f}GB")

    @task(1)
    def model_info(self):
        """Get information about a specific model"""
        model = random.choice(self.models)

        payload = {"name": model}

        with self.client.post(
            f"{self.base_url}/api/show",
            json=payload,
            name=f"Model Info - {model}",
            catch_response=True,
        ) as response:
            if response.status_code != 200:
                response.failure(f"Failed to get model info: {response.status_code}")
                return

            # Only the validity of the JSON body is checked here; its
            # fields are not inspected.
            if self._parse_json(response) is None:
                return

            print(f"{model} info retrieved")

Setup Instructions

  1. Install Ollama: Download from ollama.ai
  2. Pull Models: Install every model the script selects from (self.models):
    ollama pull llama2:7b
    ollama pull mistral:7b
    ollama pull codellama:7b
    ollama pull phi:2.7b
  3. Start Ollama: Run ollama serve (usually starts automatically)
  4. Verify Setup: Test with curl http://localhost:11434/api/tags

What This Tests

  • Local AI Performance: Measures response times for local models
  • Model Comparison: Compare different models on same hardware
  • Resource Usage: Monitor CPU/GPU usage during testing
  • Reliability: Test local AI stability under load

Performance Tips

  • GPU Acceleration: Use a supported GPU (for example, an NVIDIA GPU) for faster inference
  • Model Size: Smaller models (7B) are faster than larger ones (13B, 70B)
  • Memory: Ensure sufficient RAM for model loading
  • Concurrent Users: Start with low numbers to avoid overwhelming local hardware

Common Issues

  • Model Not Found: Ensure models are pulled with ollama pull
  • Connection Refused: Check if Ollama service is running
  • Slow Responses: Local models are often slower than cloud APIs, depending on hardware and model size
  • Memory Issues: Large models require significant RAM/VRAM
