Skip to content

Visual RAG

Visual RAG indexes your files as both text and images, combining the stability of text indexing with the flexibility of visual indexing. It can retrieve information from graphs, charts, and complex table layouts that text-based RAG systems miss.

What you’ll learn:

  • How to upload files and create vector stores
  • How to search with both text and image retrieval
  • How to build an end-to-end RAG pipeline

Supported formats: PDF, PPTX, DOCX, HTML, PNG

Visual RAG Workflow

import os
from openai import OpenAI
# Client for the Visual RAG endpoints; the API key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://llm-server.llmhub.t-systems.net/v1",  # Note: v1 for Visual RAG
)

# Upload a file. The context manager closes the local file handle as soon as
# the upload call returns, instead of leaking it until interpreter exit.
with open("/path/to/your_file.pdf", "rb") as pdf_file:
    uploaded = client.files.create(
        file=pdf_file,
        purpose="visual-rag",
    )
print(f"File ID: {uploaded.id}")

# List the files uploaded with the "visual-rag" purpose.
file_list = client.files.list(purpose="visual-rag")
for f in file_list.data:
    print(f"ID: {f.id}, Name: {f.filename}")

# Retrieve the metadata of a single file ("file-abc123" is a placeholder ID).
file_info = client.files.retrieve("file-abc123")
print(file_info)

# Delete a file by ID.
client.files.delete("file-abc123")
# Create a vector store, naming the text and vision embedding models to use.
vs = client.vector_stores.create(
    name="my_vs",
    chunking_strategy={
        "text_embedding_model": "text-embedding-bge-m3",
        "vision_embedding_model": "tsi-embedding-colqwen2-2b-v1",
    },
)
print(f"Vector Store ID: {vs.id}")

# List all vector stores. The loop variable is deliberately not called `vs`,
# so the store created above is not overwritten by the last listed entry.
for store in client.vector_stores.list():
    print(f"{store.name} ({store.id}) - {store.file_counts}")

# Attach an uploaded file to a vector store. "xyz-456" and "file-abc123" are
# placeholder IDs; substitute the IDs returned by the calls above.
client.vector_stores.files.create(
    vector_store_id="xyz-456",
    file_id="file-abc123",
    chunking_strategy={
        "chunk_size": 1024,
        "chunk_overlap": 100,
    },
)

# List the files attached to a vector store.
files = client.vector_stores.files.list(vector_store_id="xyz-456")
for f in files.data:
    print(f"{f.id} - {f.created_at}")

# Detach a file from the vector store.
client.vector_stores.files.delete(
    vector_store_id="xyz-456",
    file_id="file-abc123",
)

# Delete the vector store itself.
client.vector_stores.delete(vector_store_id="xyz-456")

Search your vector store with both text and image results:

# Query the vector store; the extra_body keys request up to 3 text hits and
# up to 2 image hits in the same call.
results = client.vector_stores.search(
    vector_store_id="xyz-456",
    query="Which new features are supported?",
    extra_body={
        "top_k_texts": 3,
        "top_k_images": 2,
    },
)

for result in results:
    # Only the first content part of each hit is inspected.
    first_part = result.content[0]
    if first_part.type == "text":
        # Print a 200-character preview of the matched text.
        print("Text result:", first_part.text[:200])
    elif first_part.type == "base64":
        print(f"Image result from {result.filename}, page {result.page_number}")

Combine retrieval with LLM inference for a complete Visual RAG pipeline:

import os

from openai import OpenAI

# Retrieval goes to the v1 endpoint, chat completions to the v2 endpoint.
rag_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://llm-server.llmhub.t-systems.net/v1",
)
llm_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://llm-server.llmhub.t-systems.net/v2",
)

# Defined once so the retrieval query and the prompt cannot drift apart.
question = "What are the key metrics?"

# 1. Retrieve relevant context
contexts = rag_client.vector_stores.search(
    vector_store_id="xyz-456",
    query=question,
    extra_body={"top_k_texts": 5, "top_k_images": 3},
)

# 2. Build prompt with text and image contexts
# The first content part carries the instruction plus all retrieved text;
# each retrieved image is appended as its own part.
content = [{"type": "text", "text": f"Based on the following context, answer: {question}\n\n"}]
for ctx in contexts:
    part = ctx.content[0]
    if part.type == "text":
        content[0]["text"] += part.text + "\n\n"
    elif part.type == "base64":
        # NOTE(review): assumes part.text is already a value the chat API
        # accepts as an image URL (e.g. a "data:image/...;base64," URL).
        # If it is raw base64, it needs that prefix added -- confirm.
        content.append({"type": "image_url", "image_url": {"url": part.text}})

# 3. Get answer from LLM
response = llm_client.chat.completions.create(
    model="claude-sonnet-4",
    messages=[{"role": "user", "content": content}],
    max_tokens=2048,
)
print(response.choices[0].message.content)