Hello,
I noticed some behaviour I thought was a little peculiar when I was [profiling
WAL file sizes](https://theunterminatedstring.com/sqlite-vacuuming/).
For a previously populated, but now empty database, calling
`incremental_vacuum` is prone to push a lot of pages through the WAL.
However, executing many small `incremental_vacuum(N)`s to remove the same number of
free pages results in significantly less pages being moved through the WAL.
I've included a script to demonstrate this below.
This script populates the database with 100 rows of 1 MB data, then deletes them.
This produced a database with:
Page Count: 24482 Freelist Count: 24206
When a single `incremental_vacuum` was called to remove all the pages the result
is a lot pages moved through the WAL:
Total pages written to WAL: 23777 Total pages moved to DB: 23777
When `incremental_vacuum(200)` was called 122 times the WAL file stats are:
Total pages written to WAL: 544 Total pages moved to DB: 544
This was run using: SQLite3 version: 3.31.1
I'll admit that I haven't went near the source code to try and make sense of it.
But I guess I was wondering if the above behaviour is expected?
```
#! /usr/bin/env python3
import sqlite3
import os
import sys
def print_versions():
print(f"SQLite Python Module Version: {sqlite3.version_info}")
print(f"SQLite3 version: {sqlite3.sqlite_version}")
class ScenarioHandles:
def __init__(self):
self.connection = None
self.cursor = None
def _checkpoint_truncate(handles):
handles.cursor.execute("PRAGMA wal_checkpoint(TRUNCATE);")
# Returns - (written to WAL, moved to db)
def _checkpoint_passive(handles):
handles.cursor.execute("PRAGMA wal_checkpoint(PASSIVE);")
row = handles.cursor.fetchone()
return (row[1], row[2])
def _get_pages_usage(handles):
handles.cursor.execute("PRAGMA page_count;")
page_count = handles.cursor.fetchone()[0]
handles.cursor.execute("PRAGMA freelist_count;")
freelist_count = handles.cursor.fetchone()[0]
used_count = page_count - freelist_count
print(f"Page Count: {page_count} Freelist Count: {freelist_count}")
return freelist_count
def _incremental_vacuum(handles, pages):
handles.cursor.execute(f"PRAGMA incremental_vacuum({pages});")
handles.cursor.fetchall()
def _write_100(handles, num_rows=100):
print("Writing rows...")
mb = 10 ** 6
one_mb_data = "x" * mb
for i in range(num_rows):
handles.cursor.execute(
"INSERT INTO Data (PrimaryKey, Stuff) VALUES (?, ?);",
(
i,
one_mb_data,
),
)
handles.connection.commit()
def _delete_100(handles):
print("Deleting rows...")
start_row = 0
end_row = 99
for i in range(start_row, end_row):
handles.cursor.execute("DELETE FROM Data WHERE PrimaryKey = ?;", (i,))
handles.connection.commit()
def setup_database(db_file):
handles = ScenarioHandles()
os.makedirs(os.path.dirname(db_file), exist_ok=True)
if os.path.exists(db_file):
os.remove(db_file)
handles.connection = sqlite3.connect(db_file)
handles.cursor = handles.connection.cursor()
handles.cursor.execute("PRAGMA auto_vacuum=2;")
handles.cursor.execute("PRAGMA journal_mode=WAL;")
handles.cursor.execute(
"""
CREATE TABLE Data (
PrimaryKey INTEGER PRIMARY KEY,
Stuff TEXT);
"""
)
_write_100(handles)
_delete_100(handles)
_checkpoint_truncate(handles)
handles.cursor.execute("PRAGMA wal_autocheckpoint(0);")
# Bomb out early for manual testing
# sys.exit(0)
return handles
def big_incr_vacuum():
db_file = "db/test_big.db"
handles = setup_database(db_file)
_get_pages_usage(handles)
print("Starting one big incremental vacuum")
_incremental_vacuum(handles, 0)
(written_to_wal, moved_to_db) = _checkpoint_passive(handles)
print(
f"Total pages written to WAL: {written_to_wal} Total pages moved to DB: {moved_to_db}"
)
_get_pages_usage(handles)
def small_incr_vacuum():
db_file = "db/test_small.db"
handles = setup_database(db_file)
free_pages = _get_pages_usage(handles)
incr_vaccum_pages = 200
steps = int(free_pages / incr_vaccum_pages) + 1
print(
f"Starting {steps} calls to incremental_vacuum with {incr_vaccum_pages} pages"
)
written_to_wal = 0
moved_to_db = 0
for s in range(steps):
_incremental_vacuum(handles, 200)
(to_wal, to_db) = _checkpoint_passive(handles)
written_to_wal = written_to_wal + to_wal
moved_to_db = moved_to_db + to_db
print(
f"Total pages written to WAL: {written_to_wal} Total pages moved to DB: {moved_to_db}"
)
_get_pages_usage(handles)
if __name__ == "__main__":
print_versions()
print("\n\n")
big_incr_vacuum()
print("\n\n")
small_incr_vacuum()
```