1. Setup & Configuration¶
In [22]:
import os
import subprocess
import requests
# === Configuration ===
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ollama:11434")
DEFAULT_MODEL = "llama3.2:latest"
print(f"Ollama host: {OLLAMA_HOST}")
print(f"Default model: {DEFAULT_MODEL}")
Out[22]:
Ollama host: http://ollama:11434
Default model: llama3.2:latest
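If Ollama runs directly on the host rather than in the ollama container, the same notebook works by pointing OLLAMA_HOST at the local default port (11434). A minimal sketch of that override, to be run before the configuration cell, is shown below; the localhost URL is an assumption about your setup, not part of this notebook's environment.

# Optional override (sketch): use a host-local Ollama instead of the
# "ollama" container. 11434 is Ollama's default API port; adjust as needed.
import os

os.environ["OLLAMA_HOST"] = "http://localhost:11434"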
2. Connection Health Check¶
In [23]:
def check_ollama_health() -> tuple[bool, bool]:
    """Check if Ollama server is running and model is available.

    Returns:
        tuple: (server_healthy, model_available)
    """
    try:
        response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
        if response.status_code == 200:
            print("✓ Ollama server is running!")
            models = response.json()
            model_names = [m.get("name", "") for m in models.get("models", [])]
            if DEFAULT_MODEL in model_names:
                print(f"✓ Model '{DEFAULT_MODEL}' is available")
                return True, True
            else:
                print(f"✗ Model '{DEFAULT_MODEL}' not found!")
                print()
                if model_names:
                    print("Available models:")
                    for name in model_names:
                        print(f" - {name}")
                else:
                    print("No models installed.")
                print()
                print("To fix this, run:")
                print(f" ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
                return True, False
        else:
            print(f"Ollama returned unexpected status: {response.status_code}")
            return False, False
    except requests.exceptions.ConnectionError:
        print("✗ Cannot connect to Ollama server!")
        print()
        print("To fix this, run:")
        print(" ujust ollama start")
        return False, False
    except requests.exceptions.Timeout:
        print("✗ Connection to Ollama timed out!")
        return False, False


ollama_healthy, model_available = check_ollama_health()
Out[23]:
✓ Ollama server is running!
✓ Model 'llama3.2:latest' is available
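For scripts that only need to know whether the server is up, a lighter probe than listing every installed model is Ollama's version endpoint. The sketch below assumes the standard GET /api/version route, which returns a small JSON payload containing the server version.

# Quick liveness probe (sketch): /api/version returns a tiny JSON payload,
# so it is cheaper than enumerating models via /api/tags.
import requests

try:
    r = requests.get(f"{OLLAMA_HOST}/api/version", timeout=5)
    r.raise_for_status()
    print(f"Ollama version: {r.json().get('version', 'unknown')}")
except requests.exceptions.RequestException as e:
    print(f"Ollama not reachable: {e}")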
3. GPU Status Check¶
In [24]:
print("=== GPU Status ===")
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name,memory.used,memory.total,utilization.gpu", "--format=csv,noheader,nounits"],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
lines = result.stdout.strip().split("\n")
for i, line in enumerate(lines):
parts = line.split(", ")
if len(parts) >= 4:
name, mem_used, mem_total, util = parts
print(f"GPU {i}: {name}")
print(f" Memory: {mem_used} MB / {mem_total} MB")
print(f" Utilization: {util}%")
else:
print("nvidia-smi returned an error")
print(result.stderr)
except FileNotFoundError:
print("nvidia-smi not found - NVIDIA GPU may not be available")
except subprocess.TimeoutExpired:
print("nvidia-smi timed out")
except Exception as e:
print(f"Error checking GPU: {e}")
print("=== GPU Status ===") try: result = subprocess.run( ["nvidia-smi", "--query-gpu=name,memory.used,memory.total,utilization.gpu", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: lines = result.stdout.strip().split("\n") for i, line in enumerate(lines): parts = line.split(", ") if len(parts) >= 4: name, mem_used, mem_total, util = parts print(f"GPU {i}: {name}") print(f" Memory: {mem_used} MB / {mem_total} MB") print(f" Utilization: {util}%") else: print("nvidia-smi returned an error") print(result.stderr) except FileNotFoundError: print("nvidia-smi not found - NVIDIA GPU may not be available") except subprocess.TimeoutExpired: print("nvidia-smi timed out") except Exception as e: print(f"Error checking GPU: {e}")
Out[24]:
=== GPU Status ===
GPU 0: NVIDIA GeForce RTX 4080 SUPER
 Memory: 2139 MB / 16376 MB
 Utilization: 5%
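Parsing nvidia-smi text works, but if the optional pynvml package (NVIDIA's NVML bindings) happens to be installed, the same numbers can be read programmatically without string splitting. The sketch below assumes that package is available; it is not required by this notebook.

# Sketch: read GPU name, memory and utilization via NVML instead of parsing
# nvidia-smi output. Requires the optional pynvml package.
try:
    import pynvml

    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):  # older pynvml versions return bytes
            name = name.decode()
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        print(f"GPU {i}: {name}")
        print(f" Memory: {mem.used // 1024**2} MB / {mem.total // 1024**2} MB")
        print(f" Utilization: {util.gpu}%")
    pynvml.nvmlShutdown()
except ImportError:
    print("pynvml not installed - skipping NVML check")
except Exception as e:
    print(f"NVML error: {e}")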
4. GPU Usage During Inference¶
In [25]:
print("=== GPU Usage During Inference ===")
if not model_available:
print()
print("⚠ Skipping inference test - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
print("Running inference and checking GPU metrics...")
print()
# Run a generation to load the model
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": DEFAULT_MODEL,
"prompt": "Write a haiku about computers.",
"stream": False
}
)
result = response.json()
if "error" in result:
print(f"✗ Error: {result['error']}")
else:
print(f"Response: {result['response']}")
print()
# Check Ollama's reported metrics
print("Ollama Inference Metrics:")
print(f" Prompt eval count: {result.get('prompt_eval_count', 'N/A')}")
print(f" Prompt eval duration: {result.get('prompt_eval_duration', 0) / 1e9:.3f}s")
print(f" Eval count (tokens generated): {result.get('eval_count', 'N/A')}")
print(f" Eval duration: {result.get('eval_duration', 0) / 1e9:.3f}s")
print(f" Total duration: {result.get('total_duration', 0) / 1e9:.3f}s")
if result.get('eval_count') and result.get('eval_duration'):
tokens_per_sec = result['eval_count'] / (result['eval_duration'] / 1e9)
print(f" Tokens/second: {tokens_per_sec:.1f}")
print("=== GPU Usage During Inference ===") if not model_available: print() print("⚠ Skipping inference test - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: print("Running inference and checking GPU metrics...") print() # Run a generation to load the model response = requests.post( f"{OLLAMA_HOST}/api/generate", json={ "model": DEFAULT_MODEL, "prompt": "Write a haiku about computers.", "stream": False } ) result = response.json() if "error" in result: print(f"✗ Error: {result['error']}") else: print(f"Response: {result['response']}") print() # Check Ollama's reported metrics print("Ollama Inference Metrics:") print(f" Prompt eval count: {result.get('prompt_eval_count', 'N/A')}") print(f" Prompt eval duration: {result.get('prompt_eval_duration', 0) / 1e9:.3f}s") print(f" Eval count (tokens generated): {result.get('eval_count', 'N/A')}") print(f" Eval duration: {result.get('eval_duration', 0) / 1e9:.3f}s") print(f" Total duration: {result.get('total_duration', 0) / 1e9:.3f}s") if result.get('eval_count') and result.get('eval_duration'): tokens_per_sec = result['eval_count'] / (result['eval_duration'] / 1e9) print(f" Tokens/second: {tokens_per_sec:.1f}")
Out[25]:
=== GPU Usage During Inference ===
Running inference and checking GPU metrics...
Response: Glowing screen whispers
Code and circuits dance within
Mind's digital sea

Ollama Inference Metrics:
 Prompt eval count: 32
 Prompt eval duration: 3.496s
 Eval count (tokens generated): 16
 Eval duration: 0.063s
 Total duration: 9.682s
 Tokens/second: 255.8
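The metrics above are reported by Ollama itself. To watch the GPU actually working, one option is to sample utilization from a background thread while a generation streams; the sketch below reuses nvidia-smi and the /api/generate streaming mode. The sampling interval and prompt are arbitrary choices for illustration.

# Sketch: sample GPU utilization while a streaming generation runs, so the
# load caused by inference is visible rather than inferred from token timings.
import json
import subprocess
import threading

import requests

samples = []
stop = threading.Event()

def sample_gpu_util(interval=0.25):
    """Poll nvidia-smi until told to stop, recording utilization.gpu (%)."""
    while not stop.is_set():
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            break
        if out.returncode == 0:
            samples.append(int(out.stdout.strip().splitlines()[0]))
        stop.wait(interval)

sampler = threading.Thread(target=sample_gpu_util, daemon=True)
sampler.start()

# Stream a generation so the GPU stays busy while the sampler runs.
with requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={"model": DEFAULT_MODEL, "prompt": "List five uses for a GPU.", "stream": True},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line:
            json.loads(line)  # each line is one JSON chunk; content ignored here

stop.set()
sampler.join()

if samples:
    print(f"GPU utilization during generation: min {min(samples)}%, max {max(samples)}%")

A peak utilization well above the idle value from section 3 is a direct sign that inference is running on the GPU.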
5. List Running Models (GPU Memory)¶
In [26]:
print("=== Models Loaded in GPU Memory ===")
response = requests.get(f"{OLLAMA_HOST}/api/ps")
running = response.json()
if running.get("models"):
for model in running["models"]:
name = model.get("name", "Unknown")
size = model.get("size", 0) / (1024**3)
vram = model.get("size_vram", 0) / (1024**3)
expires = model.get("expires_at", "N/A")
print(f" - {name}")
print(f" Total Size: {size:.2f} GB")
print(f" VRAM Usage: {vram:.2f} GB")
print(f" Expires: {expires}")
else:
print(" No models currently loaded in memory")
print("=== Models Loaded in GPU Memory ===") response = requests.get(f"{OLLAMA_HOST}/api/ps") running = response.json() if running.get("models"): for model in running["models"]: name = model.get("name", "Unknown") size = model.get("size", 0) / (1024**3) vram = model.get("size_vram", 0) / (1024**3) expires = model.get("expires_at", "N/A") print(f" - {name}") print(f" Total Size: {size:.2f} GB") print(f" VRAM Usage: {vram:.2f} GB") print(f" Expires: {expires}") else: print(" No models currently loaded in memory")
Out[26]:
=== Models Loaded in GPU Memory ===
- llama3.2:latest
Total Size: 2.56 GB
VRAM Usage: 2.56 GB
Expires: 2025-12-28T20:29:33.116691371Z
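Loaded models stay in VRAM until the expires_at time shown above. If GPU memory is needed sooner, Ollama's documented behaviour is that a generate request with an empty prompt and keep_alive set to 0 unloads the model immediately; a minimal sketch follows.

# Sketch: ask Ollama to evict the model from GPU memory right away instead of
# waiting for the keep-alive timeout (empty prompt + keep_alive=0).
import requests

resp = requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={"model": DEFAULT_MODEL, "keep_alive": 0},
    timeout=30,
)
print(f"Unload request status: {resp.status_code}")

# /api/ps should now report no loaded models (it may take a moment).
print(requests.get(f"{OLLAMA_HOST}/api/ps", timeout=5).json())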
Summary¶
This notebook verified:
- GPU Detection - nvidia-smi shows available GPUs and memory
- Ollama Connection - Server is accessible and responding
- GPU Inference - Model runs on GPU with measurable performance
- VRAM Usage - Models loaded in GPU memory
Key Metrics¶
- Tokens/second - Higher is better; GPU-accelerated generation is typically far faster than CPU-only inference
- VRAM Usage - Should roughly match the model size, indicating the model is fully offloaded to the GPU (a quick check is sketched after this list)
- GPU Utilization - Should rise noticeably while a generation is running
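A quick way to confirm the VRAM point above is to compare size_vram with size from /api/ps, the same fields read in section 5: a ratio near 1.0 means the model is fully offloaded to the GPU, while a lower ratio suggests part of it is running from system RAM. A small sketch:

# Sketch: check how much of each loaded model actually sits in GPU memory.
# size_vram / size near 1.0 => fully on the GPU; noticeably lower => partial
# CPU offload, which typically drags down tokens/second.
import requests

for m in requests.get(f"{OLLAMA_HOST}/api/ps", timeout=5).json().get("models", []):
    size = m.get("size", 0)
    vram = m.get("size_vram", 0)
    ratio = vram / size if size else 0.0
    status = "fully on GPU" if ratio >= 0.99 else "partially on CPU"
    print(f"{m.get('name', 'unknown')}: {ratio:.0%} in VRAM ({status})")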