Reading about Ollama’s capabilities is one thing. Building something real with it reveals the gaps in the documentation, the edge cases in the API, and — importantly — just how quickly a useful application comes together.
This guide builds five complete, production-ready applications using Ollama and Python. Not toy examples — applications structured the way you would actually deploy them, with proper error handling, configuration management, and streaming where it matters. Each project adds a layer of complexity and teaches something that the simpler ones do not.
By the end you will have working code for: a smart CLI assistant, a batch document summarizer, a private customer support chatbot (FastAPI + streaming), a code review tool, and a local AI API proxy server.
🔗 This is Post #11 in the Ollama Unlocked series. Requires Ollama installed and running — see Ollama Masterclass (Post #1). For integrating document retrieval into these apps, see RAG with Ollama (Post #10).
Setup: Common Dependencies
pip install ollama fastapi uvicorn python-dotenv rich click pypdf
Create a .env file for configuration:
OLLAMA_HOST=http://localhost:11434
DEFAULT_MODEL=llama4:scout
FAST_MODEL=qwen3:7b
EMBED_MODEL=nomic-embed-text
Project 1: Smart CLI Assistant
A terminal-based AI assistant with conversation history, model switching, and rich formatting.
#!/usr/bin/env python3
# cli_assistant.py
import os
import json
import click
import ollama
from rich.console import Console
from rich.markdown import Markdown
from rich.prompt import Prompt
from pathlib import Path
from datetime import datetime
console = Console()
class ChatSession:
    """One conversation with an Ollama model.

    Keeps the running message list (optionally starting with a system
    prompt), streams replies to the terminal, and can append the
    conversation to a JSON history file in the user's home directory.
    """

    def __init__(self, model: str, system_prompt: str = None):
        """Start a session for *model*, seeding the system prompt if given."""
        self.model = model
        self.messages = []
        self.history_file = Path.home() / ".ollama_chat_history.json"
        if system_prompt:
            self.messages.append({
                "role": "system",
                "content": system_prompt
            })

    def chat(self, user_input: str) -> str:
        """Send *user_input*, stream the reply to the console, and return it.

        If the request fails or is interrupted, the user message is removed
        again so the history never contains a question without an answer;
        the original exception is re-raised.
        """
        self.messages.append({"role": "user", "content": user_input})
        pieces = []
        console.print("\n[bold blue]Assistant:[/bold blue] ", end="")
        try:
            # Stream the response
            stream = ollama.chat(
                model=self.model,
                messages=self.messages,
                stream=True
            )
            for chunk in stream:
                content = chunk["message"]["content"]
                pieces.append(content)
                # markup=False: model output may contain [brackets]
                console.print(content, end="", markup=False)
        except BaseException:
            # Includes KeyboardInterrupt: roll back the unanswered message.
            self.messages.pop()
            raise
        console.print()  # Newline after response
        full_response = "".join(pieces)
        self.messages.append({
            "role": "assistant",
            "content": full_response
        })
        return full_response

    def _load_history(self) -> list:
        """Return previously saved sessions, or [] when none are usable."""
        if not self.history_file.exists():
            return []
        try:
            loaded = json.loads(self.history_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and undecodable bytes.
            loaded = None
        if isinstance(loaded, list):
            return loaded
        # The file is unusable; set it aside instead of overwriting it so the
        # user can still inspect it. Best effort only.
        backup = self.history_file.with_name(self.history_file.name + ".corrupt")
        try:
            self.history_file.replace(backup)
        except OSError:
            pass
        return []

    def save_history(self):
        """Append this session (minus system messages) to the history file."""
        history = {
            "timestamp": datetime.now().isoformat(),
            "model": self.model,
            "messages": [m for m in self.messages if m["role"] != "system"]
        }
        all_history = self._load_history()
        all_history.append(history)
        # Keep last 50 sessions
        all_history = all_history[-50:]
        self.history_file.write_text(
            json.dumps(all_history, indent=2), encoding="utf-8"
        )

    def clear(self):
        """Drop the conversation but keep any system messages."""
        self.messages = [m for m in self.messages if m["role"] == "system"]
        console.print("[yellow]Conversation cleared.[/yellow]")
@click.command()
@click.option("--model", "-m", default="llama4:scout",
              help="Model to use")
@click.option("--system", "-s", default=None,
              help="System prompt")
@click.option("--one-shot", "-o", default=None,
              help="Single prompt (non-interactive)")
def main(model: str, system: str, one_shot: str):
    """Local AI assistant powered by Ollama."""
    # NOTE: click uses the docstring above as the --help text, so details
    # live in comments instead.
    #
    # Flow: check that Ollama answers, build a ChatSession, then either send
    # a single prompt (--one-shot) or run the interactive loop, which
    # understands /clear, /model, /save and /quit (or /exit).

    # Verify Ollama is running
    try:
        ollama.list()
    except Exception:
        console.print("[red]Error: Ollama is not running. Start with: ollama serve[/red]")
        return

    default_system = """You are a helpful, knowledgeable assistant.
Be direct and concise. Use markdown formatting when it improves clarity.
For code, always use appropriate code blocks."""
    session = ChatSession(model, system or default_system)
    console.print(f"[bold green]Ollama Assistant[/bold green] — Model: [cyan]{model}[/cyan]")
    console.print("Commands: /clear (reset), /model (switch), /save (history), /quit\n")

    # One-shot mode
    if one_shot:
        session.chat(one_shot)
        return

    # Interactive mode
    while True:
        try:
            user_input = Prompt.ask("\n[bold green]You[/bold green]")
        except (KeyboardInterrupt, EOFError):
            session.save_history()
            console.print("\n[yellow]Session saved. Goodbye![/yellow]")
            break
        if not user_input.strip():
            continue

        # Handle commands
        if user_input.startswith("/"):
            words = user_input[1:].lower().split()
            # A bare "/" has no command word; treat it as an unknown command
            # rather than indexing into an empty list.
            cmd = words[0] if words else ""
            if cmd in ("quit", "exit"):
                session.save_history()
                console.print("[yellow]Goodbye![/yellow]")
                break
            elif cmd == "clear":
                session.clear()
            elif cmd == "save":
                session.save_history()
                console.print(f"[green]Saved to {session.history_file}[/green]")
            elif cmd == "model":
                try:
                    # Newer ollama-python releases expose the tag as "model";
                    # older ones used "name". Accept either.
                    available = [
                        m.get("model") or m.get("name")
                        for m in ollama.list()["models"]
                    ]
                except Exception as exc:
                    console.print("[red]Could not list models:[/red] ", end="")
                    console.print(str(exc), markup=False)
                    continue
                console.print("Available models:")
                for m in available:
                    marker = "→" if m == session.model else " "
                    console.print(f" {marker} {m}")
                new_model = Prompt.ask("Switch to model (or press Enter to keep current)")
                if new_model and new_model in available:
                    session.model = new_model
                    console.print(f"[green]Switched to {new_model}[/green]")
                elif new_model:
                    console.print("[red]Model not available:[/red] ", end="")
                    console.print(new_model, markup=False)
            else:
                console.print(f"[red]Unknown command: /{cmd}[/red]")
            continue

        # A failed or interrupted reply should not end the whole session.
        try:
            session.chat(user_input)
        except KeyboardInterrupt:
            console.print("\n[yellow]Response interrupted.[/yellow]")
        except Exception as exc:
            console.print("\n[red]Error:[/red] ", end="")
            # markup=False: the message may contain square brackets
            console.print(str(exc), markup=False)


if __name__ == "__main__":
    main()
Run it:
python cli_assistant.py
python cli_assistant.py --model qwen3.6:27b
python cli_assistant.py --one-shot "What is a Python generator?"
Project 2: Batch Document Summarizer
Process a folder of documents and generate structured summaries in parallel.
#!/usr/bin/env python3
# doc_summarizer.py
import ollama
import pypdf
import json
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class DocumentSummary:
    """Result record for one processed document.

    ``error`` stays ``None`` on success; on failure it carries the exception
    text and the content fields are left empty.
    """

    # File name of the source document (no directory part).
    filename: str
    # Short summary text produced by the model.
    summary: str
    # Bullet-style key points produced by the model.
    key_points: list[str]
    # Whitespace-separated word count of the extracted text.
    word_count: int
    # Wall-clock seconds spent on this document.
    processing_time: float
    # Failure description, if any.
    error: Optional[str] = None
def extract_text(file_path: Path) -> str:
    """Extract text from a PDF, a Word (.docx) document, or a text file.

    Args:
        file_path: Document to read. The format is chosen by file suffix
            (case-insensitive).

    Returns:
        The extracted text. PDF pages are joined with single spaces, DOCX
        paragraphs with newlines; any other file is decoded as UTF-8 with
        undecodable bytes dropped.
    """
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        reader = pypdf.PdfReader(str(file_path))
        # "or ''" keeps the join safe if a page yields no text at all.
        return " ".join(page.extract_text() or "" for page in reader.pages)
    if suffix == ".docx":
        # A .docx is a zip archive; decoding it as UTF-8 would only produce
        # binary noise, so read the document XML instead.
        return _extract_docx_text(file_path)
    return file_path.read_text(encoding="utf-8", errors="ignore")


def _extract_docx_text(file_path: Path) -> str:
    """Return the paragraph text stored in a .docx file's word/document.xml."""
    import zipfile
    import xml.etree.ElementTree as ET

    ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(file_path) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))
    paragraphs = [
        "".join(run.text or "" for run in paragraph.iter(f"{ns}t"))
        for paragraph in root.iter(f"{ns}p")
    ]
    return "\n".join(paragraphs)
def summarize_document(file_path: Path, model: str = "qwen3:7b",
                       max_chars: int = 12000) -> DocumentSummary:
    """Summarize a single document.

    Args:
        file_path: Document to summarize.
        model: Ollama model tag used for generation.
        max_chars: Longest text (in characters) sent to the model; anything
            beyond it is cut off and marked as truncated.

    Returns:
        A DocumentSummary. This function does not raise: any failure
        (unreadable file, Ollama error, malformed model output) is reported
        through the ``error`` field so that a batch run can continue.
    """
    start = time.time()
    try:
        text = extract_text(file_path)
        word_count = len(text.split())
        # Trim so the prompt stays within the requested context window
        if len(text) > max_chars:
            text = text[:max_chars] + "\n...[truncated]"

        # Doubled braces are literal JSON braces; {text} is the only
        # substitution in this f-string.
        prompt = f"""Analyze this document and return a JSON object with:
{{
  "summary": "2-3 sentence executive summary",
  "key_points": ["point 1", "point 2", "point 3", "point 4", "point 5"],
  "document_type": "type of document",
  "main_topic": "primary subject"
}}

Document:
{text}

Return only valid JSON."""
        response = ollama.generate(
            model=model,
            prompt=prompt,
            format="json",
            options={"temperature": 0.1, "num_ctx": 8192}
        )
        data = json.loads(response["response"])
        if not isinstance(data, dict):
            raise ValueError("model did not return a JSON object")
        key_points = data.get("key_points")
        if not isinstance(key_points, list):
            key_points = []
        return DocumentSummary(
            filename=file_path.name,
            summary=str(data.get("summary") or ""),
            key_points=[str(point) for point in key_points],
            word_count=word_count,
            processing_time=round(time.time() - start, 2)
        )
    except Exception as e:
        # Deliberately broad: one bad document must not abort the batch.
        return DocumentSummary(
            filename=file_path.name,
            summary="",
            key_points=[],
            word_count=0,
            processing_time=round(time.time() - start, 2),
            error=str(e)
        )
def process_folder(folder_path: str, model: str = "qwen3:7b",
                   workers: int = 2) -> list[DocumentSummary]:
    """Process all documents in a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        model: Ollama model tag passed to ``summarize_document``.
        workers: Number of worker threads; values below 1 are treated as 1.

    Returns:
        One DocumentSummary per supported file, in completion order.
    """
    folder = Path(folder_path)
    supported = {".pdf", ".txt", ".md", ".docx"}
    # Only regular files: a directory named e.g. "notes.txt" is not a
    # document. Sorted so the submission order is reproducible.
    files = sorted(
        f for f in folder.iterdir()
        if f.is_file() and f.suffix.lower() in supported
    )
    worker_count = max(1, workers)
    print(f"Processing {len(files)} documents with {worker_count} workers...")
    summaries = []
    if not files:
        return summaries

    # Use thread pool for parallel processing
    # Note: Ollama serializes GPU inference, but IO + CPU work is parallel
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [
            executor.submit(summarize_document, f, model)
            for f in files
        ]
        for future in as_completed(futures):
            summary = future.result()
            summaries.append(summary)
            status = "✓" if not summary.error else "✗"
            print(f" {status} {summary.filename} "
                  f"({summary.processing_time}s, {summary.word_count} words)")
    return summaries
def generate_report(summaries: list[DocumentSummary], output_path: str):
    """Generate a markdown report from summaries.

    Successful documents are listed alphabetically by file name, each with
    its summary and key points; failed documents follow in their own
    section together with the error text.

    Args:
        summaries: Results to report on.
        output_path: Destination file; it is overwritten.
    """
    successful = [s for s in summaries if not s.error]
    failed = [s for s in summaries if s.error]

    lines = ["# Document Summary Report\n"]
    lines.append(f"**Processed**: {len(successful)} documents")
    lines.append(f"**Failed**: {len(failed)} documents\n")

    for summary in sorted(successful, key=lambda s: s.filename):
        lines.append(f"## {summary.filename}")
        lines.append(f"\n{summary.summary}\n")
        lines.append("**Key Points:**")
        lines.extend(f"- {point}" for point in summary.key_points)
        lines.append(f"\n*{summary.word_count} words · {summary.processing_time}s*\n")

    if failed:
        lines.append("## Failed Documents")
        lines.extend(f"- **{f.filename}**: {f.error}" for f in failed)

    # Explicit UTF-8: the report contains model-generated text, which the
    # platform's default encoding may not be able to represent.
    Path(output_path).write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReport saved to {output_path}")
if __name__ == "__main__":
    import sys

    # Folder comes from the first CLI argument; "documents" is the fallback.
    cli_args = sys.argv[1:]
    folder = cli_args[0] if cli_args else "documents"

    summaries = process_folder(folder, model="qwen3:7b", workers=2)
    generate_report(summaries, "summary_report.md")

    # Also save raw JSON
    raw_records = [asdict(item) for item in summaries]
    raw_json = json.dumps(raw_records, indent=2)
    Path("summaries.json").write_text(raw_json)
Run it:
python doc_summarizer.py ./my_documents
Project 3: Private Customer Support Chatbot (FastAPI + Streaming)
A web API for a customer support bot with streaming responses and conversation management.
# support_bot.py
import ollama
import uuid
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional
import json
app = FastAPI(title="Private Support Bot")
# In-memory session store (use Redis for production)
sessions: dict[str, list] = {}
SYSTEM_PROMPT = """You are a helpful customer support agent for TechCorp software.
PRODUCT INFO:
- TechCorp Pro: $49/month, up to 10 users, all features
- TechCorp Team: $29/month per user, unlimited users, collaboration features
- TechCorp Free: Up to 3 users, basic features, community support only
POLICIES:
- Free 14-day trial for Pro and Team plans
- Cancel anytime, no long-term contracts
- Data export available at any time in CSV/JSON format
- 99.9% uptime SLA for paid plans
If you cannot help with something, say so directly and provide
the support email: support@techcorp.example.com
Be concise and helpful. Do not make up information not listed above."""
class ChatRequest(BaseModel):
    """Request body shared by the chat endpoints."""

    # Text the user is sending.
    message: str
    # Conversation to continue; the endpoints generate a new id when this
    # is missing or empty.
    session_id: Optional[str] = None
class ChatResponse(BaseModel):
    """Response body of the non-streaming chat endpoint."""

    # The assistant's complete reply.
    response: str
    # Id to send back in the next request to continue this conversation.
    session_id: str
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Non-streaming chat endpoint.

    Appends the user's message to the session (creating the session on
    first use), asks the model for a reply, stores it, and returns it.

    Raises:
        HTTPException: 500 with the error text when the model call fails.
            The user message is removed from the session again in that
            case, so a retry does not duplicate it.
    """
    import asyncio  # local import: only this handler needs it

    session_id = request.session_id or str(uuid.uuid4())
    history = sessions.setdefault(
        session_id, [{"role": "system", "content": SYSTEM_PROMPT}]
    )
    user_message = {"role": "user", "content": request.message}
    history.append(user_message)

    # Keep last 20 messages + system prompt (trimmed in place so `history`
    # stays the list stored in `sessions`)
    if len(history) > 21:
        history[:] = [history[0]] + history[-20:]

    try:
        # ollama.chat blocks until the whole reply is generated; running it
        # in a worker thread keeps the event loop free for other requests.
        response = await asyncio.to_thread(
            ollama.chat,
            model="llama4:scout",
            messages=list(history),
            options={"temperature": 0.3}
        )
        assistant_message = response["message"]["content"]
    except Exception as e:
        # Undo the append only if our message is still the newest entry.
        if history and history[-1] is user_message:
            history.pop()
        raise HTTPException(status_code=500, detail=str(e)) from e

    history.append({
        "role": "assistant", "content": assistant_message
    })
    return ChatResponse(
        response=assistant_message,
        session_id=session_id
    )
@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
    """Streaming chat endpoint — tokens arrive as they generate.

    The response is a server-sent event stream of JSON payloads:
    a ``meta`` event carrying the session id, one ``token`` event per
    chunk, then ``done``. If generation fails, an ``error`` event is sent
    instead of ``done``.
    """
    session_id = request.session_id or str(uuid.uuid4())
    history = sessions.setdefault(
        session_id, [{"role": "system", "content": SYSTEM_PROMPT}]
    )
    user_message = {"role": "user", "content": request.message}
    history.append(user_message)

    # Same bound as the non-streaming endpoint: last 20 messages + system
    # prompt, so long-lived sessions do not grow without limit.
    if len(history) > 21:
        history[:] = [history[0]] + history[-20:]

    def generate():
        pieces = []
        completed = False
        # First yield the session ID
        yield f"data: {json.dumps({'session_id': session_id, 'type': 'meta'})}\n\n"
        try:
            stream = ollama.chat(
                model="llama4:scout",
                messages=list(history),
                stream=True,
                options={"temperature": 0.3}
            )
            for chunk in stream:
                content = chunk["message"]["content"]
                pieces.append(content)
                yield f"data: {json.dumps({'content': content, 'type': 'token'})}\n\n"
            completed = True
        except Exception as exc:
            yield f"data: {json.dumps({'type': 'error', 'error': str(exc)})}\n\n"
        finally:
            # Also runs when the generator is closed early. `history` is the
            # list captured above, so a session deleted mid-stream cannot
            # cause a KeyError here.
            if completed:
                # Save complete response to session
                history.append({
                    "role": "assistant", "content": "".join(pieces)
                })
            elif history and history[-1] is user_message:
                # No full answer was produced: drop the unanswered question.
                history.pop()
        if completed:
            yield f"data: {json.dumps({'type': 'done'})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Session-ID": session_id
        }
    )
@app.delete("/chat/{session_id}")
async def clear_session(session_id: str):
    """Forget a session's history; unknown ids are accepted silently."""
    sessions.pop(session_id, None)
    return {"status": "cleared"}
@app.get("/health")
async def health():
    """Report whether Ollama answers a model-list request."""
    try:
        ollama.list()
    except Exception:
        # Any failure to reach Ollama is reported in the body, not raised.
        return {"status": "degraded", "ollama": "disconnected"}
    return {"status": "healthy", "ollama": "connected"}
Run it:
uvicorn support_bot:app --reload --port 8000
# Test it
curl -X POST http://localhost:8000/chat \
-H "Content-Type: application/json" \
-d '{"message": "What plans do you offer?"}'
Project 4: Code Review Tool
A CLI tool that reviews code files and generates structured feedback.
#!/usr/bin/env python3
# code_reviewer.py
import ollama
import sys
import json
from pathlib import Path
# Prompt template for the reviewer model. It is filled in with
# str.format(language=..., code=...), so the literal braces of the JSON
# example are doubled ("{{" / "}}"); only {language} and {code} are fields.
REVIEW_PROMPT = """Review this code as a senior software engineer.

Organize feedback by severity:
[Critical] - Security vulnerabilities, data loss risk, logic errors in critical paths
[High] - Performance issues under load, important missing error handling
[Medium] - Maintainability problems, suboptimal patterns, missing tests
[Low] - Minor improvements
[Style] - Formatting/naming (only mention if significantly inconsistent)

For each issue:
- Location: filename:line_number (estimate if exact line unknown)
- Issue: What is wrong
- Why: Why it matters
- Fix: Specific code change or approach

Return as JSON:
{{
  "overall_rating": 1-10,
  "summary": "2-3 sentence overall assessment",
  "issues": [
    {{
      "severity": "Critical|High|Medium|Low|Style",
      "location": "file:line",
      "issue": "description",
      "why": "impact",
      "fix": "specific fix"
    }}
  ],
  "strengths": ["what is done well"]
}}

Code to review:
```{language}
{code}
```"""
def detect_language(file_path: Path) -> str:
    """Return a language label for *file_path* based on its suffix.

    The suffix is compared case-insensitively; anything not in the table
    is labelled "text".
    """
    by_suffix = dict([
        (".py", "python"),
        (".js", "javascript"),
        (".ts", "typescript"),
        (".go", "go"),
        (".rs", "rust"),
        (".java", "java"),
        (".rb", "ruby"),
        (".cpp", "cpp"),
        (".c", "c"),
        (".sh", "bash"),
        (".sql", "sql"),
    ])
    suffix = file_path.suffix.lower()
    if suffix in by_suffix:
        return by_suffix[suffix]
    return "text"
def review_file(file_path: str, model: str = "qwen3.6:27b") -> dict:
    """Ask the model to review one source file.

    Args:
        file_path: Path of the file to review.
        model: Ollama model tag to use.

    Returns:
        The parsed review object on success. Otherwise a dict with a single
        key: ``"error"`` (file missing or unreadable, or the request to
        Ollama failed) or ``"raw_response"`` (the model's answer was not a
        JSON object). This function does not raise for those conditions, so
        a directory run can continue past a bad file.
    """
    path = Path(file_path)
    if not path.exists():
        return {"error": f"File not found: {file_path}"}
    try:
        code = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # e.g. a directory, a permission problem, or a non-UTF-8 file
        return {"error": f"Could not read {file_path}: {exc}"}

    language = detect_language(path)
    print(f"Reviewing {path.name} ({language})...")
    try:
        response = ollama.generate(
            model=model,
            prompt=REVIEW_PROMPT.format(language=language, code=code),
            format="json",
            options={
                "temperature": 0.1,
                "num_ctx": 32768
            }
        )
    except Exception as exc:
        return {"error": f"Review request failed for {file_path}: {exc}"}

    raw = response["response"]
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return {"raw_response": raw}
    # Valid JSON that is not an object (a list, a string, ...) is no more
    # usable than unparseable text.
    return parsed if isinstance(parsed, dict) else {"raw_response": raw}
def format_review(review: dict, filename: str) -> str:
    """Render a review dict (as returned by ``review_file``) as plain text.

    Args:
        review: Parsed review, or a dict holding only ``"error"`` or
            ``"raw_response"``.
        filename: Name shown in the report header.

    Returns:
        The report text. Issues are ordered Critical, High, Medium, Low,
        Style; an issue without a severity sorts as Low, and an issue with
        an unrecognized severity is listed after all known ones.
    """
    if "error" in review:
        return f"Error: {review['error']}"

    lines = [f"\n{'=' * 60}", f"CODE REVIEW: {filename}", f"{'=' * 60}"]

    if "raw_response" in review:
        # The model's answer could not be parsed; show it as-is instead of
        # claiming that no issues were found.
        lines.append("\nThe model did not return valid JSON. Raw response:\n")
        lines.append(str(review["raw_response"]))
        return "\n".join(lines)

    rating = review.get("overall_rating", "N/A")
    lines.append(f"\nOverall Rating: {rating}/10")
    lines.append(f"\nSummary: {review.get('summary', '')}\n")

    severity_order = ["Critical", "High", "Medium", "Low", "Style"]
    rank = {name: position for position, name in enumerate(severity_order)}
    icons = {
        "Critical": "🔴", "High": "🟠", "Medium": "🟡",
        "Low": "🔵", "Style": "⚪"
    }

    def canonical(value) -> str:
        # Models do not always echo the exact label ("critical", " High ").
        label = str(value).strip().capitalize()
        return label if label in rank else str(value)

    raw_issues = review.get("issues")
    issues = (
        [item for item in raw_issues if isinstance(item, dict)]
        if isinstance(raw_issues, list) else []
    )
    if issues:
        issues_sorted = sorted(
            issues,
            key=lambda x: rank.get(canonical(x.get("severity", "Low")), len(rank))
        )
        lines.append("ISSUES FOUND:")
        lines.append("-" * 40)
        for issue in issues_sorted:
            severity = canonical(issue.get("severity", "Unknown"))
            location = issue.get("location", "")
            prefix = icons.get(severity, "•")
            lines.append(f"\n{prefix} [{severity}] {location}")
            lines.append(f" Issue: {issue.get('issue', '')}")
            lines.append(f" Why: {issue.get('why', '')}")
            lines.append(f" Fix: {issue.get('fix', '')}")
    else:
        lines.append("✓ No significant issues found")

    strengths = review.get("strengths")
    if isinstance(strengths, list) and strengths:
        lines.append("\nSTRENGTHS:")
        for strength in strengths:
            lines.append(f" ✓ {strength}")
    return "\n".join(lines)
if __name__ == "__main__":
    # Usage: python code_reviewer.py <file_or_directory> [model]
    if len(sys.argv) < 2:
        print("Usage: python code_reviewer.py <file_or_directory>")
        sys.exit(1)

    target = Path(sys.argv[1])
    model = sys.argv[2] if len(sys.argv) > 2 else "qwen3.6:27b"

    if target.is_file():
        review = review_file(str(target), model)
        print(format_review(review, target.name))
    elif target.is_dir():
        code_extensions = {".py", ".js", ".ts", ".go", ".rs", ".java"}
        max_files = 10  # Limit to 10 files
        # Regular files only, suffix matched case-insensitively, sorted so
        # that the same files are picked on every run.
        files = sorted(
            f for f in target.rglob("*")
            if f.is_file() and f.suffix.lower() in code_extensions
        )
        if len(files) > max_files:
            print(f"Found {len(files)} code files; reviewing the first {max_files}.")
        for file_path in files[:max_files]:
            review = review_file(str(file_path), model)
            # Relative path tells apart files that share a name.
            print(format_review(review, str(file_path.relative_to(target))))
    else:
        # Previously a mistyped path produced no output and exit status 0.
        print(f"Error: {target} is not a file or directory", file=sys.stderr)
        sys.exit(1)
Run it:
python code_reviewer.py my_script.py
python code_reviewer.py ./src/ # Review all code in directory
python code_reviewer.py my_script.py kimi-k2.6 # Use different model
Project 5: Local AI API Proxy Server
A server that exposes Ollama models through an OpenAI-compatible API — useful for teams sharing one Ollama server.
# ai_proxy.py
import ollama
import time
import uuid
from fastapi import FastAPI, HTTPException, Header
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional
import json
app = FastAPI(title="Local AI Proxy", version="1.0.0")
# Simple API key auth (set your own)
VALID_API_KEYS = {"local-key-1", "local-key-2"}
class Message(BaseModel):
    """A single chat message in a completion request."""

    # Speaker of the message; forwarded to Ollama unchanged.
    role: str
    # Message text.
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body of the chat completions endpoint."""

    # Ollama model tag to run.
    model: str
    # Conversation so far, oldest message first.
    messages: list[Message]
    # When true the reply is sent as a server-sent event stream.
    stream: bool = False
    # Sampling temperature; forwarded to Ollama unless explicitly null.
    temperature: Optional[float] = 0.7
    # Cap on generated tokens (sent to Ollama as "num_predict") when set.
    max_tokens: Optional[int] = None
def validate_key(authorization: str = Header(None)):
    """Check an ``Authorization: Bearer <key>`` header value.

    Args:
        authorization: Raw header value, or None when the header is absent.

    Raises:
        HTTPException: 401 "Missing API key" when the header is absent, is
            not a Bearer credential, or carries no key; 401 "Invalid API
            key" when the key is not in VALID_API_KEYS.
    """
    import secrets  # local import: only this check needs it

    # partition() instead of split(" ")[1]: tolerates extra spaces and
    # never indexes past the end. The scheme name is matched
    # case-insensitively.
    scheme, _, token = (authorization or "").strip().partition(" ")
    token = token.strip()
    if scheme.lower() != "bearer" or not token:
        raise HTTPException(status_code=401, detail="Missing API key")

    # Constant-time comparison, so response timing does not reveal how much
    # of a key was guessed correctly. Bytes are used because compare_digest
    # rejects non-ASCII str values.
    presented = token.encode("utf-8")
    if not any(
        secrets.compare_digest(presented, known.encode("utf-8"))
        for known in VALID_API_KEYS
    ):
        raise HTTPException(status_code=401, detail="Invalid API key")
@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionRequest,
    authorization: str = Header(None)
):
    """OpenAI-compatible chat completions endpoint."""
    # With stream=true the reply is a server-sent event stream of
    # "chat.completion.chunk" objects that share one id, closed by a chunk
    # whose finish_reason is "stop" and then "data: [DONE]". Otherwise a
    # single "chat.completion" object with usage counts is returned.
    import asyncio  # local import: only the non-streaming path needs it

    validate_key(authorization)
    messages = [{"role": m.role, "content": m.content}
                for m in request.messages]
    options = {}
    if request.temperature is not None:
        options["temperature"] = request.temperature
    if request.max_tokens is not None:
        options["num_predict"] = request.max_tokens

    # One id and timestamp per completion, reused by every stream chunk.
    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
    created = int(time.time())

    if request.stream:
        def chunk_event(delta: dict, finish_reason=None) -> str:
            data = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": request.model,
                "choices": [{
                    "delta": delta,
                    "index": 0,
                    "finish_reason": finish_reason
                }]
            }
            return f"data: {json.dumps(data)}\n\n"

        def generate():
            try:
                stream = ollama.chat(
                    model=request.model,
                    messages=messages,
                    stream=True,
                    options=options
                )
                for chunk in stream:
                    yield chunk_event({"content": chunk["message"]["content"]})
            except Exception as exc:
                # Headers are already sent, so report the failure in-band.
                error = {"error": {"message": str(exc), "type": "upstream_error"}}
                yield f"data: {json.dumps(error)}\n\n"
            else:
                yield chunk_event({}, finish_reason="stop")
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    try:
        # ollama.chat blocks until generation finishes; a worker thread
        # keeps the event loop available to other clients meanwhile.
        response = await asyncio.to_thread(
            ollama.chat,
            model=request.model,
            messages=messages,
            options=options
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    reply = response["message"]
    # "or 0": the counters can be present but empty.
    prompt_tokens = response.get("prompt_eval_count") or 0
    completion_tokens = response.get("eval_count") or 0
    # NOTE(review): assumes Ollama reports done_reason == "length" when the
    # num_predict cap ended generation — confirm against the API docs.
    finish_reason = "length" if response.get("done_reason") == "length" else "stop"
    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": created,
        "model": request.model,
        "choices": [{
            "index": 0,
            # Plain dict with just the fields clients expect.
            "message": {"role": reply["role"], "content": reply["content"]},
            "finish_reason": finish_reason
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }
@app.get("/v1/models")
async def list_models(authorization: str = Header(None)):
    """List the locally available Ollama models in OpenAI list format."""
    validate_key(authorization)
    models = ollama.list()["models"]
    return {
        "object": "list",
        "data": [
            {
                # Newer ollama-python releases expose the tag as "model";
                # older ones used "name". Accept either.
                "id": m.get("model") or m.get("name"),
                "object": "model",
                "created": 0,
                "owned_by": "ollama"
            }
            for m in models
        ]
    }
Run it:
uvicorn ai_proxy:app --host 0.0.0.0 --port 8080
# Use from any OpenAI-compatible tool
curl http://your-server:8080/v1/chat/completions \
-H "Authorization: Bearer local-key-1" \
-H "Content-Type: application/json" \
-d '{"model": "llama4:scout", "messages": [{"role": "user", "content": "Hello"}]}'
Production Checklist
Before deploying any Ollama-powered application:
- Ollama configured to start automatically (`systemctl enable ollama`)
- Model pre-pulled — no runtime downloads in production
- Error handling for Ollama connection failures
- Input length validation (prevent context window overflow)
- Rate limiting if exposed to multiple users
- Logging for monitoring and debugging
- Health check endpoint that verifies Ollama is responsive
- Graceful handling of model loading delays (first request is slowest)
Conclusion
These five projects cover the most common patterns for Ollama-powered applications: interactive chat, batch processing, web APIs with streaming, structured analysis, and API proxying. Each can be deployed on any machine running Ollama.
Your next step: Pick the project closest to something you actually need. Run it with your own data. The experience of building a real application reveals requirements that reading documentation does not.
📚 Continue the Series:
Last updated: May 2026.