SQLite Demo
SQLite demo: indexing behaviour¶
In [1]:
Copied!
import sqlite3
import random

# Connect to database (creates the file on first use)
conn = sqlite3.connect('university.db')
c = conn.cursor()

try:
    # Fix: drop any previous copy so re-running this cell is idempotent.
    # The original used CREATE IF NOT EXISTS plus an unconditional insert,
    # which silently doubled the table to 200k, 300k, ... rows on each re-run.
    c.execute('DROP TABLE IF EXISTS students')

    # 1. Create a table WITHOUT an index on 'grade' or 'major'
    # Note: 'id' as INTEGER PRIMARY KEY makes this a "Clustered Index" (B+ Tree)
    c.execute('''
CREATE TABLE IF NOT EXISTS students (
id INTEGER PRIMARY KEY,
name TEXT,
major TEXT,
grade INTEGER,
tuition_paid BOOLEAN
)
''')

    # 2. Insert 100,000 rows (The "Haystack")
    ROW_COUNT = 100000
    majors = ['CS', 'Math', 'Physics', 'Biology', 'History', 'Art']
    print("Generating 100k students... this might take a second.")
    data = []
    for i in range(ROW_COUNT):
        data.append((
            f"Student_{i}",
            random.choice(majors),
            random.randint(0, 100),
            random.choice([True, False])
        ))
    # Single executemany inside one transaction: far faster than per-row commits.
    c.executemany('INSERT INTO students (name, major, grade, tuition_paid) VALUES (?, ?, ?, ?)', data)
    conn.commit()
    print("Database ready.")
finally:
    # Always release the file handle, even if table creation or insert fails.
    conn.close()
# Setup for the indexing demo: populate 'university.db' with sample students.
import sqlite3
import random

# Open (or create) the database file on disk.
conn = sqlite3.connect('university.db')
cur = conn.cursor()

# 1. Create a table WITHOUT an index on 'grade' or 'major'.
#    Declaring 'id' as INTEGER PRIMARY KEY gives the table a clustered
#    B+ tree layout keyed by id.
cur.execute('''
CREATE TABLE IF NOT EXISTS students (
id INTEGER PRIMARY KEY,
name TEXT,
major TEXT,
grade INTEGER,
tuition_paid BOOLEAN
)
''')

# 2. Insert 100,000 rows (the "haystack" the queries will search).
majors = ['CS', 'Math', 'Physics', 'Biology', 'History', 'Art']
print("Generating 100k students... this might take a second.")
data = [
    (
        f"Student_{i}",
        random.choice(majors),
        random.randint(0, 100),
        random.choice([True, False]),
    )
    for i in range(100000)
]
# One bulk insert in a single transaction.
cur.executemany('INSERT INTO students (name, major, grade, tuition_paid) VALUES (?, ?, ?, ?)', data)
conn.commit()
print("Database ready.")
conn.close()
Generating 100k students... this might take a second. Database ready.
1. Full scan¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE major = 'Physics';
-- Output: SCAN TABLE students
2. Lookup using B+tree¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE id = 500;
-- Output: SEARCH TABLE students USING INTEGER PRIMARY KEY (rowid=?)
3. Create secondary index (MAP of MAP)¶
CREATE INDEX idx_major ON students(major);
-- Creates another B-Tree keyed by the indexed column ('major'), whose values are primary keys
4. Double read using secondary index¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE major = 'Physics';
-- Output: SEARCH TABLE students USING INDEX idx_major (major=?)
5. Single read (covering index)¶
EXPLAIN QUERY PLAN
SELECT id FROM students WHERE major = 'Physics';
-- Output: SEARCH students USING COVERING INDEX idx_major (major=?)
Experiment: Primary key as UUID vs Integer¶
In [2]:
Copied!
import sqlite3
import uuid
import time
import os

# Benchmark: sequential INTEGER primary keys vs random UUID (TEXT) primary keys.
DB_NAME = "benchmark.db"

# Start from a clean file so timings and sizes are comparable across runs.
if os.path.exists(DB_NAME):
    os.remove(DB_NAME)

conn = sqlite3.connect(DB_NAME)
c = conn.cursor()

try:
    # 1. Create Tables
    c.execute("CREATE TABLE int_pk (id INTEGER PRIMARY KEY, data TEXT)")
    # Note: In SQLite, strictly speaking, a customized primary key is still a B-Tree,
    # but random inserts hurt it just the same.
    c.execute("CREATE TABLE uuid_pk (id TEXT PRIMARY KEY, data TEXT)")

    ROWS = 100000
    data_payload = "x" * 100  # Some payload to make pages fill up

    # --- Test 1: Integer Inserts ---
    start = time.time()
    for i in range(ROWS):
        # SQLite auto-increments INTEGER PRIMARY KEY efficiently
        c.execute("INSERT INTO int_pk (data) VALUES (?)", (data_payload,))
    conn.commit()
    int_duration = time.time() - start
    print(f"Integer Insert Time: {int_duration:.4f}s")

    # --- Test 2: UUID Inserts ---
    start = time.time()
    for i in range(ROWS):
        # Generate random UUID
        uid = str(uuid.uuid4())
        c.execute("INSERT INTO uuid_pk (id, data) VALUES (?, ?)", (uid, data_payload))
    conn.commit()
    uuid_duration = time.time() - start
    print(f"UUID Insert Time: {uuid_duration:.4f}s")

    # --- Test 3: Measure Fragmentation (Page Count) ---
    # 'dbstat' is a compile-time virtual table; fall back gracefully when missing.
    try:
        c.execute("SELECT sum(pgsize) FROM dbstat WHERE name='int_pk'")
        int_size = c.fetchone()[0]
        c.execute("SELECT sum(pgsize) FROM dbstat WHERE name='uuid_pk'")
        uuid_size = c.fetchone()[0]
        # Fix: sum() returns NULL (-> None) for an unknown table name, which
        # previously crashed the arithmetic below with a TypeError that the
        # OperationalError handler could not catch.
        if int_size and uuid_size:
            # Fix: was an f-string with no placeholders (lint F541).
            print("\nStructure Analysis:")
            print(f"Integer Table Size: {int_size / 1024 / 1024:.2f} MB")
            print(f"UUID Table Size: {uuid_size / 1024 / 1024:.2f} MB")
            print(f"Bloat Factor: {uuid_size / int_size:.2f}x larger")
        else:
            print("\n(dbstat returned no size data for one of the tables.)")
    except sqlite3.OperationalError:
        print("\n(Your SQLite build doesn't support 'dbstat' to measure exact fragmentation, but the time difference should be clear!)")
finally:
    # Always release the database handle, even if a benchmark step fails.
    conn.close()
# Benchmark (restyled): how much do random UUID primary keys cost relative to
# SQLite's sequential INTEGER PRIMARY KEY rowids?
import sqlite3
import uuid
import time
import os

DB_NAME = "benchmark.db"
# Remove any stale file so every run starts from an empty database.
if os.path.exists(DB_NAME):
    os.remove(DB_NAME)

conn = sqlite3.connect(DB_NAME)
cur = conn.cursor()

# 1. Create Tables
cur.execute("CREATE TABLE int_pk (id INTEGER PRIMARY KEY, data TEXT)")
# Note: In SQLite, strictly speaking, a customized primary key is still a B-Tree,
# but random inserts hurt it just the same.
cur.execute("CREATE TABLE uuid_pk (id TEXT PRIMARY KEY, data TEXT)")

ROWS = 100000
data_payload = "x" * 100  # Some payload to make pages fill up

# --- Test 1: Integer Inserts ---
t0 = time.time()
for _ in range(ROWS):
    # SQLite assigns the next rowid itself, so keys always arrive in order.
    cur.execute("INSERT INTO int_pk (data) VALUES (?)", (data_payload,))
conn.commit()
int_duration = time.time() - t0
print(f"Integer Insert Time: {int_duration:.4f}s")

# --- Test 2: UUID Inserts ---
t0 = time.time()
for _ in range(ROWS):
    # Each key lands at a random position in the B-Tree.
    uid = str(uuid.uuid4())
    cur.execute("INSERT INTO uuid_pk (id, data) VALUES (?, ?)", (uid, data_payload))
conn.commit()
uuid_duration = time.time() - t0
print(f"UUID Insert Time: {uuid_duration:.4f}s")

# --- Test 3: Measure Fragmentation (Page Count) ---
# Reads the 'dbstat' virtual table when this SQLite build provides it.
try:
    cur.execute("SELECT sum(pgsize) FROM dbstat WHERE name='int_pk'")
    int_size = cur.fetchone()[0]
    cur.execute("SELECT sum(pgsize) FROM dbstat WHERE name='uuid_pk'")
    uuid_size = cur.fetchone()[0]
    print(f"\nStructure Analysis:")
    print(f"Integer Table Size: {int_size / 1024 / 1024:.2f} MB")
    print(f"UUID Table Size: {uuid_size / 1024 / 1024:.2f} MB")
    print(f"Bloat Factor: {uuid_size / int_size:.2f}x larger")
except sqlite3.OperationalError:
    print("\n(Your SQLite build doesn't support 'dbstat' to measure exact fragmentation, but the time difference should be clear!)")
conn.close()
Integer Insert Time: 0.1180s UUID Insert Time: 0.6756s Structure Analysis: Integer Table Size: 10.59 MB UUID Table Size: 14.43 MB Bloat Factor: 1.36x larger