The most common frustration with AI chat tools is this: you want to ask questions about your documents — your company’s policies, your research papers, your codebase documentation, your client contracts — and the AI does not know them. You could paste the documents in, but they are too long. You could summarize them first, but that loses the details. And if the documents contain sensitive information, pasting them into ChatGPT or Claude means sending confidential data to cloud servers.
Retrieval-Augmented Generation (RAG) solves this. Instead of including entire documents in every prompt, RAG:
- Converts your documents into searchable vector embeddings (stored locally)
- At query time, finds the most relevant passages for your question
- Passes only those relevant passages to the model as context
- Gets an accurate, grounded answer with citations
Built on Ollama, the entire pipeline runs locally — embeddings, retrieval, and generation. Your documents never leave your machine.
🔗 This is Post #10 in the Ollama Unlocked series. Requires Ollama with embedding models — see Ollama Masterclass (Post #1). For production deployment of RAG systems, see Ollama on Docker and Production (Post #12).
How RAG Works
INDEXING PHASE (run once):
Documents → Chunks → Embeddings → Vector Store
QUERY PHASE (run per question):
Question → Embedding → Search Vector Store → Retrieve Top-K Chunks
→ Build Prompt (question + chunks) → LLM → Answer with citations
The key insight: embeddings are mathematical representations of text meaning. Similar concepts have similar embeddings. Searching the vector store finds chunks that are semantically similar to your question — not just keyword matches.
Required Components
# Install Python dependencies
# NOTE(review): langchain-community is not imported by any code shown in this
# guide. The optional hybrid-search example imports rank_bm25 (PyPI package
# rank-bm25), which this command does not install.
pip install ollama chromadb langchain-community pypdf python-docx
# Pull the models you need
# nomic-embed-text is the model name hard-coded in LocalVectorStore;
# llama4:scout is the default generation model of LocalRAG.
ollama pull nomic-embed-text # Embedding model (lightweight, fast)
ollama pull llama4:scout # Generation model
Why these choices:
- nomic-embed-text: Best balance of embedding quality and speed for local use. 768-dimensional embeddings, fast inference.
- ChromaDB: Local vector database — runs in-memory or persisted to disk, no server needed.
- llama4:scout: Strong generation with long context for synthesizing retrieved passages.
Building the RAG Pipeline: Step by Step
Step 1: Document Loading and Chunking
# document_loader.py
from pathlib import Path
import pypdf
import docx
def load_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; empty string for a PDF with no pages.
    """
    pdf = pypdf.PdfReader(file_path)
    # Every page, including the last one, is terminated with "\n".
    return "".join(page.extract_text() + "\n" for page in pdf.pages)
def load_docx(file_path: str) -> str:
    """Return the paragraph text of a Word document, one paragraph per line.

    Only ``Document.paragraphs`` is read; nothing else in the file is visited.

    Args:
        file_path: Path of the Word document to read.

    Returns:
        The paragraph texts joined with "\\n".
    """
    document = docx.Document(file_path)
    lines = [para.text for para in document.paragraphs]
    return "\n".join(lines)
def load_txt(file_path: str) -> str:
    """Read a text file as UTF-8 and return its full contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    with open(file_path, encoding="utf-8") as handle:
        return handle.read()
def load_document(file_path: str) -> str:
    """Load a document's text, choosing the loader from the file extension.

    The extension is matched case-insensitively. PDF and .docx files go to
    their dedicated loaders; .txt, .md, .py, .js and .ts are read as UTF-8
    text.

    Args:
        file_path: Path of the document to load.

    Returns:
        The extracted text of the document.

    Raises:
        ValueError: If the extension is not supported, including legacy
            ``.doc`` files.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    # python-docx only reads the .docx format. Routing a legacy binary .doc
    # file to load_docx fails inside the library with an unrelated-looking
    # error, so reject it here with an actionable message instead.
    if suffix == ".doc":
        raise ValueError(
            f"Unsupported file type: {path.suffix} "
            "(legacy Word format; convert the file to .docx first)"
        )

    loaders = {
        ".pdf": load_pdf,
        ".docx": load_docx,
        ".txt": load_txt,
        ".md": load_txt,
        ".py": load_txt,
        ".js": load_txt,
        ".ts": load_txt,
    }
    loader = loaders.get(suffix)
    if loader is None:
        raise ValueError(f"Unsupported file type: {path.suffix}")
    return loader(file_path)
def chunk_text(text: str, chunk_size: int = 500,
               overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Sizes are measured in characters. Each chunk nominally spans
    ``chunk_size`` characters; when more text follows, the end is moved back
    (by at most 100 characters) to just after the nearest ``.``, ``!``, ``?``
    or newline so chunks tend to end on a sentence. Consecutive chunks share
    ``overlap`` characters. Chunks are stripped of surrounding whitespace and
    empty chunks are dropped.

    Args:
        text: The text to split.
        chunk_size: Nominal chunk length in characters; must be positive.
        overlap: Characters shared between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range (such
            values would otherwise make the loop run forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Look back up to 100 chars for a sentence terminator or newline.
            # Never look back into the first `overlap` characters of the
            # chunk: the next start is `end - overlap`, so this keeps every
            # iteration moving forward even for small chunk sizes.
            lower = max(end - 100, start + overlap)
            for i in range(end, lower, -1):
                if text[i] in ".!?\n":
                    end = i + 1
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= text_len:
            # The text is exhausted. Stepping back by `overlap` here would
            # emit a redundant tail chunk made only of already-seen text.
            break
        start = end - overlap  # Overlap for context continuity
    return chunks
Step 2: Embedding and Vector Store
# vector_store.py
import chromadb
import ollama
from typing import List
class LocalVectorStore:
    """ChromaDB collection on disk, with embeddings produced by Ollama."""

    def __init__(self, collection_name: str = "documents",
                 persist_path: str = "./chroma_db"):
        """Initialize ChromaDB with persistent storage.

        Args:
            collection_name: Collection to open, created if it is missing.
            persist_path: Directory passed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(path=persist_path)
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        self.embed_model = "nomic-embed-text"

    def embed_text(self, text: str) -> List[float]:
        """Generate the embedding vector for a text string.

        Args:
            text: Text to embed with ``self.embed_model``.

        Returns:
            The ``"embedding"`` entry of the Ollama response.
        """
        response = ollama.embeddings(model=self.embed_model, prompt=text)
        return response["embedding"]

    def add_documents(self, chunks: List[str],
                      source: str, batch_size: int = 50):
        """Embed document chunks and write them to the vector store.

        IDs have the form ``"{source}_{chunk_index}"``. Chunks are written
        with ``upsert`` so indexing the same source again replaces its
        existing entries instead of colliding with them.

        NOTE: if a source is re-indexed with fewer chunks than before, the
        entries with higher chunk indices from the earlier run are left in
        place.

        Args:
            chunks: Chunk texts in document order.
            source: Label stored in each chunk's metadata and used in its ID.
            batch_size: Number of chunks written per store call; must be
                positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        print(f"Embedding {len(chunks)} chunks from {source}...")
        total_batches = (len(chunks) + batch_size - 1) // batch_size
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            batch_num = i // batch_size + 1
            print(f" Batch {batch_num}/{total_batches}...")
            embeddings = [self.embed_text(chunk) for chunk in batch]
            # i is the offset of this batch, so i + j is the chunk's position
            # in the whole document.
            ids = [f"{source}_{i + j}" for j in range(len(batch))]
            metadatas = [{"source": source, "chunk_index": i + j}
                         for j in range(len(batch))]
            self.collection.upsert(
                documents=batch,
                embeddings=embeddings,
                ids=ids,
                metadatas=metadatas,
            )
        print(f" ✓ Added {len(chunks)} chunks")

    def search(self, query: str, top_k: int = 5) -> List[dict]:
        """Find the most relevant chunks for a query.

        Args:
            query: Natural-language query to embed and search with.
            top_k: Maximum number of chunks to return.

        Returns:
            One dict per hit with keys ``"text"``, ``"source"``,
            ``"chunk_index"`` and ``"distance"``, in the order the store
            returns them. Empty when the collection is empty or ``top_k`` is
            not positive.
        """
        # Never ask for more results than the collection holds; an empty
        # collection short-circuits before any embedding call is made.
        n_results = min(top_k, self.collection.count())
        if n_results <= 0:
            return []

        query_embedding = self.embed_text(query)
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
        )
        # Results are nested per query; index 0 is the single query sent.
        return [
            {
                "text": doc,
                "source": meta["source"],
                "chunk_index": meta["chunk_index"],
                "distance": dist,
            }
            for doc, meta, dist in zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]

    def count(self) -> int:
        """Return the number of entries in the collection."""
        return self.collection.count()
Step 3: The RAG Pipeline
# rag_pipeline.py
import ollama
from document_loader import load_document, chunk_text
from vector_store import LocalVectorStore
from pathlib import Path
class LocalRAG:
    """Index local documents and answer questions about them with Ollama.

    Storage and retrieval are delegated to ``LocalVectorStore``; answers are
    produced by ``ollama.generate`` from the retrieved chunks.
    """

    def __init__(self,
                 collection_name: str = "my_documents",
                 generation_model: str = "llama4:scout"):
        """Open the vector store collection and remember the model name.

        Args:
            collection_name: Collection name passed to ``LocalVectorStore``.
            generation_model: Ollama model used to generate answers.
        """
        self.store = LocalVectorStore(collection_name)
        self.generation_model = generation_model

    def index_document(self, file_path: str):
        """Load, chunk, and index a document.

        Args:
            file_path: Path of the document to index.

        Raises:
            ValueError: Propagated from ``load_document`` for unsupported
                file types.
        """
        print(f"Indexing: {file_path}")
        text = load_document(file_path)
        chunks = chunk_text(text, chunk_size=500, overlap=50)
        # NOTE: only the file name is used as the source label, so two files
        # with the same name in different folders share one label.
        source_name = Path(file_path).name
        self.store.add_documents(chunks, source=source_name)
        print(f" ✓ Indexed {len(chunks)} chunks from {source_name}")

    def index_folder(self, folder_path: str):
        """Index all supported documents in a folder, recursively.

        A file that fails to index is reported and skipped so one bad
        document does not abort the whole run.

        Args:
            folder_path: Root folder to search.
        """
        # ".ts" is included because load_document accepts it as plain text.
        supported = {".pdf", ".docx", ".txt", ".md", ".py", ".js", ".ts"}
        folder = Path(folder_path)
        # rglob("*") also yields directories; a directory whose name ends in
        # a supported suffix must not be handed to a loader.
        files = [f for f in folder.rglob("*")
                 if f.is_file() and f.suffix.lower() in supported]
        print(f"Found {len(files)} files to index...")
        for file_path in files:
            try:
                self.index_document(str(file_path))
            except Exception as e:
                print(f" ✗ Failed: {file_path.name} — {e}")
        print(f"\n✓ Indexed {self.store.count()} total chunks")

    def query(self, question: str, top_k: int = 5,
              verbose: bool = False) -> str:
        """Answer a natural-language question from the indexed documents.

        Args:
            question: The question to answer.
            top_k: Number of chunks to retrieve as context.
            verbose: When true, print the source and distance of each
                retrieved chunk.

        Returns:
            The model's answer, or a fixed message when nothing was
            retrieved.
        """
        # Retrieve relevant chunks
        results = self.store.search(question, top_k=top_k)
        if not results:
            return "No relevant documents found for your question."
        if verbose:
            print(f"\nRetrieved {len(results)} relevant chunks:")
            for r in results:
                print(f" [{r['source']}] distance={r['distance']:.3f}")

        # Build context from retrieved chunks. Sources are numbered from 1 so
        # the model can cite them as [Source N].
        context_parts = [
            f"[Source {number}: {result['source']}]\n{result['text']}"
            for number, result in enumerate(results, start=1)
        ]
        context = "\n\n---\n\n".join(context_parts)

        # Build prompt
        prompt = f"""Answer the question based on the provided context.
If the answer is not in the context, say "I don't have information
about that in the provided documents."
Always cite which source(s) you used in your answer using [Source N].
Context:
{context}
Question: {question}
Answer:"""

        # Generate response
        response = ollama.generate(
            model=self.generation_model,
            prompt=prompt,
            options={"num_ctx": 16384, "temperature": 0.1}
        )
        return response["response"]

    def interactive_chat(self):
        """Run an interactive Q&A session on standard input.

        ``quit`` ends the session, ``verbose`` toggles the display of
        retrieved sources, and blank input is ignored. End-of-input (Ctrl-D)
        and Ctrl-C also end the session cleanly.
        """
        print(f"\nRAG System Ready — {self.store.count()} chunks indexed")
        print("Type 'quit' to exit, 'verbose' to toggle source display\n")
        verbose = False
        while True:
            try:
                question = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Leave the loop instead of surfacing a traceback.
                print()
                break
            command = question.lower()
            if command == "quit":
                break
            if command == "verbose":
                verbose = not verbose
                print(f"Verbose mode: {'ON' if verbose else 'OFF'}")
                continue
            if not question:
                continue
            answer = self.query(question, verbose=verbose)
            print(f"\nAssistant: {answer}\n")
Step 4: Using the Pipeline
# main.py
from rag_pipeline import LocalRAG
def main():
    """Run the example: build the RAG system, ask one question, then chat.

    The indexing calls are left commented out; enable one of them on the
    first run so the collection contains documents to search.
    """
    # Initialize RAG system
    rag = LocalRAG(
        collection_name="company_docs",
        generation_model="llama4:scout"
    )
    # Index your documents (only needs to run once)
    # rag.index_document("company_policy.pdf")
    # rag.index_folder("documentation/")
    # Query your documents
    answer = rag.query("What is our vacation policy for remote employees?")
    print(answer)
    # Or start interactive mode
    rag.interactive_chat()


# Guarded so that importing this module does not run a query or start the
# interactive session as a side effect.
if __name__ == "__main__":
    main()
Complete Working Example: Company Policy Q&A
# Project structure
# The python commands below import rag_pipeline, which in turn imports
# document_loader and vector_store, so those three files must be present in
# this directory before running them.
mkdir company-rag && cd company-rag
pip install ollama chromadb pypdf python-docx
# Create and run the indexer
# Indexes every supported file under policies/ into the 'company_docs'
# collection.
python -c "
from rag_pipeline import LocalRAG
rag = LocalRAG('company_docs')
rag.index_folder('policies/') # Your PDF/DOCX folder
print('Indexing complete!')
"
# Query the documents
# Reuses the same collection name, so the chunks indexed above are available.
# interactive_chat() reads questions from standard input.
python -c "
from rag_pipeline import LocalRAG
rag = LocalRAG('company_docs') # Loads from disk — no re-indexing needed
rag.interactive_chat()
"
RAG With Open WebUI (No Code Required)
Open WebUI has built-in RAG — no Python code needed:
- Open WebUI → Settings → Documents
- Enable RAG and configure chunk size (500 recommended)
- Select embedding model: nomic-embed-text
- Upload documents via the Documents section
- In chat: toggle the RAG switch to search across all documents
- Use #filename syntax to target specific documents
This gives a full RAG experience through a chat interface with zero programming.
Optimizing RAG Quality
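Chunk Size Tuning
Note: the sizes below are given in tokens, whereas the chunk_text function in Step 1 counts characters. As a rough rule of thumb for English text, one token is about four characters, so scale chunk_size and overlap accordingly.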
| Document Type | Recommended Chunk Size | Overlap |
|---|---|---|
| Legal documents | 400–600 tokens | 50–100 |
| Technical docs | 300–500 tokens | 50 |
| Code files | 200–400 tokens | 30 |
| Research papers | 500–800 tokens | 100 |
| Meeting notes | 200–300 tokens | 25 |
Better Retrieval: Hybrid Search
Combining semantic search (embeddings) with keyword search improves recall:
# Add BM25 keyword search alongside vector search (requires: pip install rank-bm25)
from rank_bm25 import BM25Okapi
class HybridSearch:
    """BM25 keyword index over a fixed list of text chunks.

    Chunks and queries are lowercased and split on whitespace. This class
    provides only the keyword side; combining its ranking with vector search
    results is left to the caller.
    """

    def __init__(self, chunks: list[str]):
        """Tokenize the chunks and build the BM25 index.

        Args:
            chunks: Chunk texts; positions in this list are the indices
                returned by ``search``.
        """
        self.chunks = chunks
        tokenized = [chunk.lower().split() for chunk in chunks]
        # rank_bm25 averages document length over the corpus and fails with a
        # ZeroDivisionError on an empty corpus, so no index is built then.
        self.bm25 = BM25Okapi(tokenized) if tokenized else None

    def search(self, query: str, top_k: int = 5) -> list[int]:
        """Return the indices of the best-matching chunks, best first.

        Args:
            query: Keyword query.
            top_k: Maximum number of indices to return.

        Returns:
            Up to ``top_k`` positions into ``self.chunks`` ordered by
            descending BM25 score; empty when there are no chunks or
            ``top_k`` is not positive.
        """
        if self.bm25 is None or top_k <= 0:
            return []
        scores = self.bm25.get_scores(query.lower().split())
        ranked = sorted(range(len(scores)),
                        key=lambda i: scores[i],
                        reverse=True)
        return ranked[:top_k]
Re-ranking Results
After retrieval, re-rank chunks by relevance to the specific question:
def _parse_relevance_score(raw: str, default: float = 5.0) -> float:
    """Extract the first number in a model reply, clamped to the 1-10 scale."""
    import re

    # Replies such as "8/10", "8." or "Score: 8" still carry a usable number;
    # taking the first whitespace-separated token would discard them.
    match = re.search(r"\d+(?:\.\d+)?", raw)
    if match is None:
        return default
    return min(10.0, max(1.0, float(match.group())))


def rerank_chunks(query: str, chunks: list[str],
                  model: str = "llama4:scout") -> list[str]:
    """Use the LLM to re-rank retrieved chunks by relevance.

    Each chunk (truncated to its first 500 characters) is scored by the model
    on a 1-10 scale. A reply containing no number gets the neutral score 5.0.
    Chunks with equal scores keep their original relative order.

    Args:
        query: The question the chunks were retrieved for.
        chunks: Retrieved chunk texts.
        model: Ollama model used for scoring.

    Returns:
        The same chunks ordered from most to least relevant; an empty list
        when ``chunks`` is empty.
    """
    scored_chunks = []
    for chunk in chunks:
        response = ollama.generate(
            model=model,
            prompt=f"""Rate the relevance of this text passage to the question
on a scale of 1-10. Return only the number.
Question: {query}
Passage: {chunk[:500]}
Relevance score:""",
            options={"temperature": 0, "num_predict": 5}
        )
        score = _parse_relevance_score(response["response"])
        scored_chunks.append((score, chunk))
    # list.sort is stable, also with reverse=True, so ties keep input order.
    scored_chunks.sort(key=lambda x: x[0], reverse=True)
    return [chunk for _, chunk in scored_chunks]
Conclusion
A fully private RAG system — indexing your documents, querying them with natural language, getting accurate cited answers — takes roughly 200 lines of Python and runs entirely on your local machine. The pipeline in this guide is production-capable with the optimization steps applied.
Your next step: Create a folder with 3–5 PDFs you regularly need to reference. Run the indexer. Ask the system your most common questions about those documents. Measure whether the answers are accurate and cited. That test tells you if RAG on your documents is worth building into your workflow.
📚 Continue the Series:
- ← Previous The Ollama API: OpenAI-Compatible Local Server
- Next → Building AI Apps With Ollama and Python
- For deployment Ollama on Docker and Production
Last updated: May 2026.