SQLite Forum

incremental_vacuum and WAL size
Hello,

I noticed some behaviour I thought was a little peculiar when I was [profiling
WAL file sizes](https://theunterminatedstring.com/sqlite-vacuuming/).

For a previously populated, but now empty, database, calling `incremental_vacuum`
with no limit is prone to push a lot of pages through the WAL.

However, executing many small `incremental_vacuum(N)` calls to remove the same
number of free pages results in significantly fewer pages being moved through
the WAL.
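
To make the comparison concrete, here is roughly what the two patterns boil down
to. This is a minimal sketch, assuming a hypothetical database that was already
created with `auto_vacuum=INCREMENTAL`, is in WAL mode, and has a large freelist;
the helper names are made up for illustration.

```
import sqlite3

# Assumes the database was created with PRAGMA auto_vacuum=2 (INCREMENTAL)
# and is already in WAL journal mode.


def vacuum_all_at_once(db_path):
    # Pattern 1: reclaim every free page in one go, then checkpoint once.
    cur = sqlite3.connect(db_path).cursor()
    cur.execute("PRAGMA wal_autocheckpoint=0;")
    cur.execute("PRAGMA incremental_vacuum(0);").fetchall()
    _, in_wal, moved = cur.execute("PRAGMA wal_checkpoint(PASSIVE);").fetchone()
    return in_wal, moved


def vacuum_in_batches(db_path, batch=200):
    # Pattern 2: reclaim a small batch at a time, checkpointing after each batch.
    cur = sqlite3.connect(db_path).cursor()
    cur.execute("PRAGMA wal_autocheckpoint=0;")
    total_in_wal = total_moved = 0
    while cur.execute("PRAGMA freelist_count;").fetchone()[0] > 0:
        cur.execute(f"PRAGMA incremental_vacuum({batch});").fetchall()
        _, in_wal, moved = cur.execute("PRAGMA wal_checkpoint(PASSIVE);").fetchone()
        total_in_wal += in_wal
        total_moved += moved
    return total_in_wal, total_moved
```

Both variants end up reclaiming the same freelist; they just differ in how many
pages pass through the WAL along the way.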

I've included a script to demonstrate this below.

The script populates the database with 100 rows of 1 MB of data each, then deletes them.

This produced a database with:

    Page Count: 24482 Freelist Count: 24206

When a single `incremental_vacuum` was called to remove all the pages, the result
was a lot of pages moved through the WAL:

    Total pages written to WAL: 23777 Total pages moved to DB: 23777

When `incremental_vacuum(200)` was called 122 times instead (enough calls to clear the 24206-page freelist 200 pages at a time), the WAL file stats were:

    Total pages written to WAL: 544 Total pages moved to DB: 544


This was run using SQLite version 3.31.1.


I'll admit that I haven't gone near the source code to try and make sense of it,
but I was wondering: is the above behaviour expected?


```
#! /usr/bin/env python3

import sqlite3
import os
import sys


def print_versions():
    print(f"SQLite Python Module Version: {sqlite3.version_info}")
    print(f"SQLite3 version: {sqlite3.sqlite_version}")


class ScenarioHandles:
    def __init__(self):
        self.connection = None
        self.cursor = None


def _checkpoint_truncate(handles):
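    # TRUNCATE checkpoints every frame and truncates the WAL file back to
    # zero bytes, so each scenario starts measuring from an empty WAL.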
    handles.cursor.execute("PRAGMA wal_checkpoint(TRUNCATE);")


# Returns - (written to WAL, moved to db)
def _checkpoint_passive(handles):
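    # The pragma returns one row: (busy, pages written to the WAL,
    # pages moved back into the database file).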
    handles.cursor.execute("PRAGMA wal_checkpoint(PASSIVE);")
    row = handles.cursor.fetchone()
    return (row[1], row[2])


def _get_pages_usage(handles):
    handles.cursor.execute("PRAGMA page_count;")
    page_count = handles.cursor.fetchone()[0]
    handles.cursor.execute("PRAGMA freelist_count;")
    freelist_count = handles.cursor.fetchone()[0]

    print(f"Page Count: {page_count} Freelist Count: {freelist_count}")
    return freelist_count


def _incremental_vacuum(handles, pages):
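    # A page count of 0 (or omitting the argument) tells SQLite to clear
    # the entire freelist in one go.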
    handles.cursor.execute(f"PRAGMA incremental_vacuum({pages});")
    handles.cursor.fetchall()


def _write_100(handles, num_rows=100):
    print("Writing rows...")

    mb = 10 ** 6
    one_mb_data = "x" * mb

    for i in range(num_rows):
        handles.cursor.execute(
            "INSERT INTO Data (PrimaryKey, Stuff) VALUES (?, ?);",
            (
                i,
                one_mb_data,
            ),
        )
        handles.connection.commit()


def _delete_100(handles, num_rows=100):
    print("Deleting rows...")
    for i in range(num_rows):
        handles.cursor.execute("DELETE FROM Data WHERE PrimaryKey = ?;", (i,))
        handles.connection.commit()


def setup_database(db_file):

    handles = ScenarioHandles()
    os.makedirs(os.path.dirname(db_file), exist_ok=True)

    if os.path.exists(db_file):
        os.remove(db_file)

    handles.connection = sqlite3.connect(db_file)
    handles.cursor = handles.connection.cursor()

    handles.cursor.execute("PRAGMA auto_vacuum=2;")
    handles.cursor.execute("PRAGMA journal_mode=WAL;")

    handles.cursor.execute(
        """
    CREATE TABLE Data (
        PrimaryKey INTEGER PRIMARY KEY,
        Stuff TEXT);
    """
    )

    _write_100(handles)
    _delete_100(handles)
    _checkpoint_truncate(handles)

    handles.cursor.execute("PRAGMA wal_autocheckpoint(0);")

    # Bomb out early for manual testing
    # sys.exit(0)

    return handles


def big_incr_vacuum():
    db_file = "db/test_big.db"
    handles = setup_database(db_file)

    _get_pages_usage(handles)

    print("Starting one big incremental vacuum")
    _incremental_vacuum(handles, 0)
    (written_to_wal, moved_to_db) = _checkpoint_passive(handles)

    print(
        f"Total pages written to WAL: {written_to_wal} Total pages moved to DB: {moved_to_db}"
    )
    _get_pages_usage(handles)


def small_incr_vacuum():
    db_file = "db/test_small.db"
    handles = setup_database(db_file)

    free_pages = _get_pages_usage(handles)

    incr_vacuum_pages = 200
    steps = free_pages // incr_vacuum_pages + 1
    print(
        f"Starting {steps} calls to incremental_vacuum with {incr_vacuum_pages} pages"
    )

    written_to_wal = 0
    moved_to_db = 0

    for _ in range(steps):
        _incremental_vacuum(handles, incr_vacuum_pages)
        (to_wal, to_db) = _checkpoint_passive(handles)

        written_to_wal = written_to_wal + to_wal
        moved_to_db = moved_to_db + to_db

    print(
        f"Total pages written to WAL: {written_to_wal} Total pages moved to DB: {moved_to_db}"
    )
    _get_pages_usage(handles)


if __name__ == "__main__":
    print_versions()
    print("\n\n")
    big_incr_vacuum()
    print("\n\n")
    small_incr_vacuum()

```