Python
The Python driver is OmniData’s reference implementation. It wraps aiosqlite for async access and handles PRAGMA setup, search, and blob retrieval. You can also use the standard sqlite3 module directly for synchronous access.
Opening a container
With the OmniData driver
from omnidata import OmniDataContainer
async with OmniDataContainer("~/path/to/instance.omnidata") as omni:
resources = await omni.list_resources(limit=10)
The driver opens both index.db and memory.db, and sets all required PRAGMAs (WAL, foreign_keys, cache_size) on each connection.
With raw aiosqlite
Open each database separately:
import aiosqlite
# Open index.db for resources, chunks, embeddings, search
async with aiosqlite.connect("instance.omnidata/index.db") as index_db:
await index_db.execute("PRAGMA journal_mode = WAL")
await index_db.execute("PRAGMA foreign_keys = ON")
await index_db.execute("PRAGMA cache_size = -64000")
index_db.row_factory = aiosqlite.Row
cursor = await index_db.execute(
"SELECT * FROM omnidata_resources WHERE deleted_at IS NULL LIMIT 10"
)
resources = await cursor.fetchall()
# Open memory.db for collections, edges, tags, memory
async with aiosqlite.connect("instance.omnidata/memory.db") as memory_db:
await memory_db.execute("PRAGMA journal_mode = WAL")
await memory_db.execute("PRAGMA foreign_keys = ON")
await memory_db.execute("PRAGMA cache_size = -64000")
memory_db.row_factory = aiosqlite.Row
cursor = await memory_db.execute(
"SELECT * FROM omnidata_collections WHERE deleted_at IS NULL"
)
collections = await cursor.fetchall()
With synchronous sqlite3
import sqlite3

# The same PRAGMA set is required on every connection, so apply it in a loop.
# index.db — resources, chunks, embeddings, FTS5
index_conn = sqlite3.connect("instance.omnidata/index.db")
index_conn.row_factory = sqlite3.Row
for pragma in (
    "PRAGMA journal_mode = WAL",
    "PRAGMA foreign_keys = ON",
    "PRAGMA cache_size = -64000",
):
    index_conn.execute(pragma)

resources = index_conn.execute(
    "SELECT * FROM omnidata_resources WHERE deleted_at IS NULL LIMIT 10"
).fetchall()

# memory.db — collections, edges, tags, memory
memory_conn = sqlite3.connect("instance.omnidata/memory.db")
memory_conn.row_factory = sqlite3.Row
for pragma in (
    "PRAGMA journal_mode = WAL",
    "PRAGMA foreign_keys = ON",
    "PRAGMA cache_size = -64000",
):
    memory_conn.execute(pragma)

collections = memory_conn.execute(
    "SELECT * FROM omnidata_collections WHERE deleted_at IS NULL"
).fetchall()
Querying resources
All resource queries target index.db:
# All resources from a specific adapter
adapter_sql = """
SELECT id, uri, title, pipeline_state, resource_at
FROM omnidata_resources
WHERE source = ? AND deleted_at IS NULL
ORDER BY resource_at DESC
"""
rows = index_conn.execute(adapter_sql, ("chrome-capture",)).fetchall()

# Count by pipeline state
state_counts_sql = """
SELECT pipeline_state, COUNT(*)
FROM omnidata_resources
WHERE deleted_at IS NULL
GROUP BY pipeline_state
"""
counts = index_conn.execute(state_counts_sql).fetchall()
Full-text search
FTS5 queries target index.db:
# BM25-ranked match over the FTS5 index, joined back to chunk/resource rows;
# soft-deleted rows (deleted_at set) are excluded.
results = index_conn.execute("""
SELECT c.content, r.title, r.uri
FROM fts_chunks fts
JOIN omnidata_chunks c ON c.rowid = fts.rowid
JOIN omnidata_resources r ON r.id = c.resource_id
WHERE fts_chunks MATCH ?
AND c.deleted_at IS NULL AND r.deleted_at IS NULL
ORDER BY rank
LIMIT 10
""", ("machine learning",)).fetchall()

for row in results:
    print(f"{row['title']}: {row['content'][:100]}...")
Vector similarity search
Vector search targets index.db:
import struct
import math
def cosine_similarity(a: bytes, b: bytes) -> float:
    """Return the cosine similarity of two packed float32 vectors.

    Both arguments are little-endian float32 blobs of equal length
    (4 bytes per component). Returns 0.0 when either vector has zero
    magnitude (including empty input). Raises struct.error if b does
    not decode to the same number of floats as a.
    """
    n = len(a) // 4
    va = struct.unpack(f"<{n}f", a)
    vb = struct.unpack(f"<{n}f", b)
    dot = sum(x * y for x, y in zip(va, vb))
    norm = math.sqrt(sum(x * x for x in va)) * math.sqrt(sum(x * x for x in vb))
    return dot / norm if norm > 0 else 0.0
# query_embedding is a bytes object (little-endian float32)
candidate_sql = """
SELECT c.id, c.content, c.embedding, r.title
FROM omnidata_chunks c
JOIN omnidata_resources r ON r.id = c.resource_id
WHERE c.embedding IS NOT NULL
AND c.deleted_at IS NULL AND r.deleted_at IS NULL
"""
chunks = index_conn.execute(candidate_sql).fetchall()

# Score every embedded chunk against the query vector and keep the top 10.
ranked = [(cosine_similarity(query_embedding, row["embedding"]), row) for row in chunks]
ranked.sort(key=lambda pair: pair[0], reverse=True)
scored = ranked[:10]
Reading blobs
Blobs are stored as content-addressed files in the blobs/ directory:
from pathlib import Path
def read_blob(container_path: str, content_hash: str) -> bytes:
    """Read a blob's bytes from the container's content-addressed store.

    Blobs live at blobs/<first-two-hash-chars>/<full-hash> under the
    container root. Raises FileNotFoundError if the blob is missing.
    Assumes content_hash has at least two characters — TODO confirm the
    hash format guaranteed by the container spec.
    """
    blob_path = Path(container_path) / "blobs" / content_hash[:2] / content_hash
    return blob_path.read_bytes()
# Get the blob for a specific resource
resource = index_conn.execute(
    "SELECT content_hash FROM omnidata_resources WHERE uri = ?",
    ("file:///Users/daniel/report.pdf",),
).fetchone()

# content_hash may be NULL for resources with no stored blob, so guard both
# the missing-row and missing-hash cases before touching the blob store.
if resource and resource["content_hash"]:
    pdf_bytes = read_blob("instance.omnidata", resource["content_hash"])
    with open("report.pdf", "wb") as f:
        f.write(pdf_bytes)
Reading the manifest
The manifest is a JSON file at the root of the bundle:
import json
from pathlib import Path
# The manifest is plain JSON at the bundle root.
manifest_path = Path("instance.omnidata") / "manifest.json"
manifest = json.loads(manifest_path.read_text())

print(f"Instance: {manifest['instance_name']} ({manifest['instance_id']})")
print(f"Owner: {manifest['owner_identity']}")
print(f"Schema version: {manifest['schema_version']}")