SQLite Demo
SQLite demo: indexing behaviour¶
In [1]:
Copied!
import sqlite3
import random

# Connect to database (creates the file on first use)
conn = sqlite3.connect('university.db')
c = conn.cursor()

try:
    # Fix: drop any previous copy so re-running this cell is idempotent.
    # The original used CREATE IF NOT EXISTS plus an unconditional insert,
    # which silently doubled the table to 200k, 300k, ... rows on each re-run.
    c.execute('DROP TABLE IF EXISTS students')

    # 1. Create a table WITHOUT an index on 'grade' or 'major'
    # Note: 'id' as INTEGER PRIMARY KEY makes this a "Clustered Index" (B+ Tree)
    c.execute('''
CREATE TABLE IF NOT EXISTS students (
id INTEGER PRIMARY KEY,
name TEXT,
major TEXT,
grade INTEGER,
tuition_paid BOOLEAN
)
''')

    # 2. Insert 100,000 rows (The "Haystack")
    ROW_COUNT = 100000
    majors = ['CS', 'Math', 'Physics', 'Biology', 'History', 'Art']
    print("Generating 100k students... this might take a second.")
    data = []
    for i in range(ROW_COUNT):
        data.append((
            f"Student_{i}",
            random.choice(majors),
            random.randint(0, 100),
            random.choice([True, False])
        ))
    # Single executemany inside one transaction: far faster than per-row commits.
    c.executemany('INSERT INTO students (name, major, grade, tuition_paid) VALUES (?, ?, ?, ?)', data)
    conn.commit()
    print("Database ready.")
finally:
    # Always release the file handle, even if table creation or insert fails.
    conn.close()
# Setup for the indexing demo: populate 'university.db' with sample students.
import sqlite3
import random

# Open (or create) the database file on disk.
conn = sqlite3.connect('university.db')
cur = conn.cursor()

# 1. Create a table WITHOUT an index on 'grade' or 'major'.
#    Declaring 'id' as INTEGER PRIMARY KEY gives the table a clustered
#    B+ tree layout keyed by id.
cur.execute('''
CREATE TABLE IF NOT EXISTS students (
id INTEGER PRIMARY KEY,
name TEXT,
major TEXT,
grade INTEGER,
tuition_paid BOOLEAN
)
''')

# 2. Insert 100,000 rows (the "haystack" the queries will search).
majors = ['CS', 'Math', 'Physics', 'Biology', 'History', 'Art']
print("Generating 100k students... this might take a second.")
data = [
    (
        f"Student_{i}",
        random.choice(majors),
        random.randint(0, 100),
        random.choice([True, False]),
    )
    for i in range(100000)
]
# One bulk insert in a single transaction.
cur.executemany('INSERT INTO students (name, major, grade, tuition_paid) VALUES (?, ?, ?, ?)', data)
conn.commit()
print("Database ready.")
conn.close()
Generating 100k students... this might take a second. Database ready.
1. Full scan¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE major = 'Physics';
-- Output: SCAN TABLE students
2. Lookup using B+tree¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE id = 500;
-- Output: SEARCH TABLE students USING INTEGER PRIMARY KEY (rowid=?)
3. Create secondary index (MAP of MAP)¶
CREATE INDEX idx_major ON students(major);
-- Creates another B-Tree keyed by the indexed column ('major'), whose values are primary keys
4. Double read using secondary index¶
EXPLAIN QUERY PLAN
SELECT * FROM students WHERE major = 'Physics';
-- Output: SEARCH TABLE students USING INDEX idx_major (major=?)
5. Single read (covering index)¶
EXPLAIN QUERY PLAN
SELECT id FROM students WHERE major = 'Physics';
-- Output: SEARCH students USING COVERING INDEX idx_major (major=?)
Experiment: Primary key as UUID vs Integer¶
In [2]:
Copied!
import sqlite3
import uuid
import time
import os

# Benchmark: sequential INTEGER primary keys vs random UUID (TEXT) primary keys.
DB_NAME = "benchmark.db"

# Start from a clean file so timings and sizes are comparable across runs.
if os.path.exists(DB_NAME):
    os.remove(DB_NAME)

conn = sqlite3.connect(DB_NAME)
c = conn.cursor()

try:
    # 1. Create Tables
    c.execute("CREATE TABLE int_pk (id INTEGER PRIMARY KEY, data TEXT)")
    # Note: In SQLite, strictly speaking, a customized primary key is still a B-Tree,
    # but random inserts hurt it just the same.
    c.execute("CREATE TABLE uuid_pk (id TEXT PRIMARY KEY, data TEXT)")

    ROWS = 100000
    data_payload = "x" * 100  # Some payload to make pages fill up

    # --- Test 1: Integer Inserts ---
    start = time.time()
    for i in range(ROWS):
        # SQLite auto-increments INTEGER PRIMARY KEY efficiently
        c.execute("INSERT INTO int_pk (data) VALUES (?)", (data_payload,))
    conn.commit()
    int_duration = time.time() - start
    print(f"Integer Insert Time: {int_duration:.4f}s")

    # --- Test 2: UUID Inserts ---
    start = time.time()
    for i in range(ROWS):
        # Generate random UUID
        uid = str(uuid.uuid4())
        c.execute("INSERT INTO uuid_pk (id, data) VALUES (?, ?)", (uid, data_payload))
    conn.commit()
    uuid_duration = time.time() - start
    print(f"UUID Insert Time: {uuid_duration:.4f}s")

    # --- Test 3: Measure Fragmentation (Page Count) ---
    # 'dbstat' is a compile-time virtual table; fall back gracefully when missing.
    try:
        c.execute("SELECT sum(pgsize) FROM dbstat WHERE name='int_pk'")
        int_size = c.fetchone()[0]
        c.execute("SELECT sum(pgsize) FROM dbstat WHERE name='uuid_pk'")
        uuid_size = c.fetchone()[0]
        # Fix: sum() returns NULL (-> None) for an unknown table name, which
        # previously crashed the arithmetic below with a TypeError that the
        # OperationalError handler could not catch.
        if int_size and uuid_size:
            # Fix: was an f-string with no placeholders (lint F541).
            print("\nStructure Analysis:")
            print(f"Integer Table Size: {int_size / 1024 / 1024:.2f} MB")
            print(f"UUID Table Size: {uuid_size / 1024 / 1024:.2f} MB")
            print(f"Bloat Factor: {uuid_size / int_size:.2f}x larger")
        else:
            print("\n(dbstat returned no size data for one of the tables.)")
    except sqlite3.OperationalError:
        print("\n(Your SQLite build doesn't support 'dbstat' to measure exact fragmentation, but the time difference should be clear!)")
finally:
    # Always release the database handle, even if a benchmark step fails.
    conn.close()
# Benchmark (restyled): how much do random UUID primary keys cost relative to
# SQLite's sequential INTEGER PRIMARY KEY rowids?
import sqlite3
import uuid
import time
import os

DB_NAME = "benchmark.db"
# Remove any stale file so every run starts from an empty database.
if os.path.exists(DB_NAME):
    os.remove(DB_NAME)

conn = sqlite3.connect(DB_NAME)
cur = conn.cursor()

# 1. Create Tables
cur.execute("CREATE TABLE int_pk (id INTEGER PRIMARY KEY, data TEXT)")
# Note: In SQLite, strictly speaking, a customized primary key is still a B-Tree,
# but random inserts hurt it just the same.
cur.execute("CREATE TABLE uuid_pk (id TEXT PRIMARY KEY, data TEXT)")

ROWS = 100000
data_payload = "x" * 100  # Some payload to make pages fill up

# --- Test 1: Integer Inserts ---
t0 = time.time()
for _ in range(ROWS):
    # SQLite assigns the next rowid itself, so keys always arrive in order.
    cur.execute("INSERT INTO int_pk (data) VALUES (?)", (data_payload,))
conn.commit()
int_duration = time.time() - t0
print(f"Integer Insert Time: {int_duration:.4f}s")

# --- Test 2: UUID Inserts ---
t0 = time.time()
for _ in range(ROWS):
    # Each key lands at a random position in the B-Tree.
    uid = str(uuid.uuid4())
    cur.execute("INSERT INTO uuid_pk (id, data) VALUES (?, ?)", (uid, data_payload))
conn.commit()
uuid_duration = time.time() - t0
print(f"UUID Insert Time: {uuid_duration:.4f}s")

# --- Test 3: Measure Fragmentation (Page Count) ---
# Reads the 'dbstat' virtual table when this SQLite build provides it.
try:
    cur.execute("SELECT sum(pgsize) FROM dbstat WHERE name='int_pk'")
    int_size = cur.fetchone()[0]
    cur.execute("SELECT sum(pgsize) FROM dbstat WHERE name='uuid_pk'")
    uuid_size = cur.fetchone()[0]
    print(f"\nStructure Analysis:")
    print(f"Integer Table Size: {int_size / 1024 / 1024:.2f} MB")
    print(f"UUID Table Size: {uuid_size / 1024 / 1024:.2f} MB")
    print(f"Bloat Factor: {uuid_size / int_size:.2f}x larger")
except sqlite3.OperationalError:
    print("\n(Your SQLite build doesn't support 'dbstat' to measure exact fragmentation, but the time difference should be clear!)")
conn.close()
Integer Insert Time: 0.1180s UUID Insert Time: 0.6756s Structure Analysis: Integer Table Size: 10.59 MB UUID Table Size: 14.43 MB Bloat Factor: 1.36x larger