Documentation Specification SDKs

Python

The Python driver is OmniData’s reference implementation. It wraps aiosqlite for async access and handles PRAGMA setup, search, and blob retrieval. You can also use the standard sqlite3 module directly for synchronous access.

Opening a container

With the OmniData driver

from omnidata import OmniDataContainer

# `async with` and `await` are only valid inside a coroutine: run this from
# an `async def` function (for example one started with asyncio.run).
async with OmniDataContainer("~/path/to/instance.omnidata") as omni:
    # limit=10 presumably caps the number of resources returned;
    # confirm the exact semantics against the driver reference.
    resources = await omni.list_resources(limit=10)

The driver opens both index.db and memory.db and sets all required PRAGMAs (journal_mode = WAL, foreign_keys, cache_size) on each connection.

With raw aiosqlite

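Open each database separately and apply the same PRAGMAs to each connection. Because the snippet uses async with and await, run it inside a coroutine (for example, one started with asyncio.run):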

import aiosqlite

# Open index.db for resources, chunks, embeddings, search
async with aiosqlite.connect("instance.omnidata/index.db") as index_db:
    await index_db.execute("PRAGMA journal_mode = WAL")
    await index_db.execute("PRAGMA foreign_keys = ON")
    await index_db.execute("PRAGMA cache_size = -64000")
    index_db.row_factory = aiosqlite.Row

    cursor = await index_db.execute(
        "SELECT * FROM omnidata_resources WHERE deleted_at IS NULL LIMIT 10"
    )
    resources = await cursor.fetchall()

# Open memory.db for collections, edges, tags, memory
async with aiosqlite.connect("instance.omnidata/memory.db") as memory_db:
    await memory_db.execute("PRAGMA journal_mode = WAL")
    await memory_db.execute("PRAGMA foreign_keys = ON")
    await memory_db.execute("PRAGMA cache_size = -64000")
    memory_db.row_factory = aiosqlite.Row

    cursor = await memory_db.execute(
        "SELECT * FROM omnidata_collections WHERE deleted_at IS NULL"
    )
    collections = await cursor.fetchall()

With synchronous sqlite3

import sqlite3

# Connection-level settings applied to every database in the bundle.
# cache_size is negative, which SQLite interprets as a size in KiB
# rather than a page count.
_PRAGMAS = (
    "PRAGMA journal_mode = WAL",
    "PRAGMA foreign_keys = ON",
    "PRAGMA cache_size = -64000",
)

# index.db holds resources, chunks, embeddings and the FTS5 index
index_conn = sqlite3.connect("instance.omnidata/index.db")
# Rows can then be read by column name as well as by position.
index_conn.row_factory = sqlite3.Row
for pragma in _PRAGMAS:
    index_conn.execute(pragma)

live_resources_sql = (
    "SELECT * FROM omnidata_resources WHERE deleted_at IS NULL LIMIT 10"
)
resources = index_conn.execute(live_resources_sql).fetchall()

# memory.db holds collections, edges, tags and memory
memory_conn = sqlite3.connect("instance.omnidata/memory.db")
memory_conn.row_factory = sqlite3.Row
for pragma in _PRAGMAS:
    memory_conn.execute(pragma)

live_collections_sql = (
    "SELECT * FROM omnidata_collections WHERE deleted_at IS NULL"
)
collections = memory_conn.execute(live_collections_sql).fetchall()

Querying resources

All resource queries target index.db. The examples below reuse the index_conn connection opened in the synchronous sqlite3 example:

# Live resources whose source matches one adapter, ordered by
# resource_at descending
by_adapter_sql = """
    SELECT id, uri, title, pipeline_state, resource_at
    FROM omnidata_resources
    WHERE source = ? AND deleted_at IS NULL
    ORDER BY resource_at DESC
"""
adapter_params = ("chrome-capture",)
rows = index_conn.execute(by_adapter_sql, adapter_params).fetchall()

# Number of live resources in each pipeline state
state_counts_sql = """
    SELECT pipeline_state, COUNT(*)
    FROM omnidata_resources
    WHERE deleted_at IS NULL
    GROUP BY pipeline_state
"""
counts = index_conn.execute(state_counts_sql).fetchall()

Full-text search

FTS5 queries target index.db:

# Join each FTS match back to its chunk and owning resource, skipping
# soft-deleted rows on both sides. ORDER BY rank lists the best matches first.
fts_sql = """
    SELECT c.content, r.title, r.uri
    FROM fts_chunks fts
    JOIN omnidata_chunks c ON c.rowid = fts.rowid
    JOIN omnidata_resources r ON r.id = c.resource_id
    WHERE fts_chunks MATCH ?
      AND c.deleted_at IS NULL AND r.deleted_at IS NULL
    ORDER BY rank
    LIMIT 10
"""
search_terms = ("machine learning",)
results = index_conn.execute(fts_sql, search_terms).fetchall()

# Print each hit as "<title>: <first 100 characters of the chunk>..."
for hit in results:
    title = hit["title"]
    preview = hit["content"][:100]
    print(f"{title}: {preview}...")

Vector similarity search

Vector search targets index.db:

import struct
import math

def cosine_similarity(a: bytes, b: bytes) -> float:
    """Return the cosine similarity of two packed float32 vectors.

    Both buffers are decoded as little-endian float32 arrays whose element
    count is derived from the length of ``a``. ``struct.error`` is raised
    when either buffer is not exactly that many floats long. If either
    vector has zero magnitude the result is 0.0 instead of a division by
    zero.
    """
    layout = f"<{len(a) // 4}f"
    left = struct.unpack(layout, a)
    right = struct.unpack(layout, b)

    magnitude = math.sqrt(sum(v * v for v in left)) * math.sqrt(
        sum(v * v for v in right)
    )
    # "not > 0" (rather than "<= 0") also sends a NaN magnitude to 0.0.
    if not magnitude > 0:
        return 0.0
    return sum(p * q for p, q in zip(left, right)) / magnitude

# query_embedding must be a bytes object holding little-endian float32 values
candidates_sql = """
    SELECT c.id, c.content, c.embedding, r.title
    FROM omnidata_chunks c
    JOIN omnidata_resources r ON r.id = c.resource_id
    WHERE c.embedding IS NOT NULL
      AND c.deleted_at IS NULL AND r.deleted_at IS NULL
"""
chunks = index_conn.execute(candidates_sql).fetchall()

# Score every candidate chunk against the query, then keep the 10 most
# similar as (score, row) pairs, highest score first.
scored = [
    (cosine_similarity(query_embedding, chunk["embedding"]), chunk)
    for chunk in chunks
]
scored.sort(key=lambda pair: pair[0], reverse=True)
scored = scored[:10]

Reading blobs

Blobs are stored as content-addressed files in the blobs/ directory, grouped into subdirectories named after the first two characters of the content hash:

from pathlib import Path

def read_blob(container_path: str, content_hash: str) -> bytes:
    """Return the raw bytes of the blob identified by ``content_hash``.

    The file is looked up at ``<container_path>/blobs/<first two characters
    of the hash>/<hash>``. Errors raised while opening or reading the file
    (such as ``FileNotFoundError``) propagate to the caller.
    """
    shard = content_hash[:2]
    location = Path(container_path, "blobs", shard, content_hash)
    with location.open("rb") as blob_file:
        return blob_file.read()

# Get the blob for a specific resource. Like the other resource queries,
# skip soft-deleted rows so a stale row for the same URI is never returned.
resource = index_conn.execute(
    "SELECT content_hash FROM omnidata_resources"
    " WHERE uri = ? AND deleted_at IS NULL",
    ("file:///Users/daniel/report.pdf",),
).fetchone()

# fetchone() returns None when nothing matches; also skip rows that carry
# no content hash, since there is then no blob file to read.
if resource and resource["content_hash"]:
    pdf_bytes = read_blob("instance.omnidata", resource["content_hash"])
    # Write the blob out unchanged, in binary mode.
    with open("report.pdf", "wb") as f:
        f.write(pdf_bytes)

Reading the manifest

The manifest is a JSON file at the root of the bundle:

import json
from pathlib import Path

# Decode explicitly as UTF-8: without an encoding argument, read_text()
# uses the platform's locale encoding, which can garble non-ASCII values.
manifest_path = Path("instance.omnidata") / "manifest.json"
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

# Print a few identifying fields from the manifest.
print(f"Instance: {manifest['instance_name']} ({manifest['instance_id']})")
print(f"Owner: {manifest['owner_identity']}")
print(f"Schema version: {manifest['schema_version']}")