I'm just working with storing a file system information on a database to reorganize/cleanup it and did the following (just in case it can help others):
1 - Got a list of all files (in linux) with "ls -AlRGg --full-time > all-files.txt".
2- Then parsed and inserted then in a sqlite database with the schema shown at the end.
3- For quering purposes I also created this index after the database was populated:
====
CREATE INDEX "files_size_idx" ON "files"(size)
====
4 - Create some list of interesting file extensions using this query:
====
--create view if not exists folders_by_type_size_view as
select
/*hp.id as fp7_id,*/ hp.name as fp7,
/*gp.id as fp6_id,*/ gp.name as fp6,
/*fp.id as fp5_id,*/ fp.name as fp5,
/*ep.id as fp4_id,*/ ep.name as fp4,
/*dp.id as fp3_id,*/ dp.name as fp3,
/*cp.id as fp2_id,*/ cp.name as fp2,
/*bp.id as fp1_id,*/ bp.name as fp1,
a.* from (
select folder_id as parent_id, folder, sum(size) as total_size from files_list_view where name like '%.h' group by folder_id order by 3 desc limit 1000
) as a
left join folders as b on a.parent_id=b.id
left join folders_list_view as bp on b.parent_id=bp.id
left join folders_list_view as cp on bp.parent_id=cp.id
left join folders_list_view as dp on cp.parent_id=dp.id
left join folders_list_view as ep on dp.parent_id=ep.id
left join folders_list_view as fp on ep.parent_id=fp.id
left join folders_list_view as gp on fp.parent_id=gp.id
left join folders_list_view as hp on gp.parent_id=hp.id;
====
Here is the result of querying stats_view:
====
tbl|cnt|total_size
files|16433449|1931486899532
folders|2522473|10573008896
====
And here is the script I used to parse and populate the database (using https://github.com/mingodad/squilu):
====
// generated with: ls -AlRGg --full-time
auto ls_fname = "all-files.txt";
//if(!existsfile(ls_fname)) os.system("ls -AlRGg --full-time > " + ls_fname);
auto db = SQLite3(ls_fname + ".db");
db.exec_dml([==[
create table if not exists names(
id integer primary key not null,
name varchar not null unique
);
delete from names;
create table if not exists folders(
id integer primary key not null,
parent_id integer check(id != parent_id) references folders(id),
name_id integer not null references names(id),
size integer not null,
cdate datetime not null,
unique(parent_id, name_id)
);
delete from folders;
create table if not exists files(
id integer primary key not null,
folder_id integer not null references folders(id),
name_id integer not null references names(id),
size integer not null,
cdate datetime not null,
hash varchar,
unique(folder_id, name_id)
);
delete from files;
CREATE VIEW if not exists folders_list_view AS
SELECT
a.id,
b.name,
a.parent_id, --c.id,
a.name_id --b.id
FROM folders AS a
LEFT JOIN names AS b ON a.name_id = b.id;
CREATE VIEW if not exists files_list_view AS
SELECT
a.id,
d.name as folder,
b.name,
a.size,
a.cdate,
a.folder_id, --c.id,
a.name_id --b.id,
FROM files AS a
LEFT JOIN names AS b ON a.name_id = b.id
LEFT JOIN folders AS c ON a.folder_id = c.id
LEFT JOIN names AS d ON c.name_id = d.id;
create view if not exists stats_view as
select 'folders' as tbl, count(*) cnt, sum(size) total_size from folders
UNION
select 'files' as tbl, count(*) cnt, sum(size) total_size from files;
]==]);
auto stmt_name_insert = db.prepare("insert into names(name) values(?);");
auto stmt_folder_insert = db.prepare("insert into folders(parent_id, name_id, size, cdate) values(?, (select id from names where name=?), ?, ?);");
auto stmt_file_insert = db.prepare("insert into files(folder_id, name_id, size, cdate) values(?, (select id from names where name=?), ?, ?);");
auto stmt_folder_id = db.prepare("select id from folders where parent_id=? and name_id=(select id from names where name=?);");
const root_folder_name = ".";
auto fd = file(ls_fname, "r");
auto line, folder_id, rc, name, fcdate, fsize, record, file_count = 0, folder_count = 0, folder_total_size = 0, file_total_size = 0, count = 0;
db.exec_dml("begin;");
//insert the root folder
stmt_name_insert.bind_exec(root_folder_name);
print(stmt_folder_insert.bind_exec(NULL, root_folder_name, 0, "0000-00-00"));
folder_id = 1;
while((line = fd.read_line()))
{
//if(++count > 10) break;
if(line.len() == 0) continue;
auto line_type = line[0];
switch(line_type)
{
case 'd': //folder
case '-': //file
//record = line.split(' ');
//
//*** The last separator should be only one space, because a filename can start with spaces
record = line.match("(%S+)%s+(%S+)%s+(%S+)%s+(%S+)%s+(%S+)%s+(%S+) (.+)");
//
//print(record.len(), line);
//print(record.len(), record[6]);
name = record[6];
fsize = record[2].tointeger();
fcdate = format("%s %s", record[3], record[4].slice(0, 8));
stmt_name_insert.bind_exec(name);
if(line_type == '-') {
++file_count;
file_total_size += fsize;
rc = stmt_file_insert.bind_exec(folder_id, name, fsize, fcdate);
} else {
++folder_count;
folder_total_size += fsize;
rc = stmt_folder_insert.bind_exec(folder_id, name, fsize, fcdate);
}
if(rc != db.SQLITE_DONE) print("++FAILED", rc, folder_id, name, fsize, fcdate, line);
break;
case '.': //folder header
//print(0, line);
record = line.split('/');
auto last_idx = record.len() -1;
foreach(idx, name in record)
{
if(idx > 0) {
//file/folder names can end with ':', we are only cutting in the last name
if(idx == last_idx && name[name.len()-1] == ':') name = name.slice(0, -1);
//stmt_name_insert.bind_exec(name);
//stmt_folder_insert.bind_exec(folder_id, name);
folder_id = stmt_folder_id.bind_exec_get_one(folder_id, name);
if(type(folder_id) != "integer") print("**FAILED", folder_id, name, line);
//folder_id = db.changes() ? db.last_row_id() : stmt_folder_id.bind_exec_get_one(folder_id, name);
}
else folder_id = 1;
}
break;
case 'l': //symbolic link
break;
}
}
db.exec_dml("commit;");
print(folder_count, folder_total_size, file_count, file_total_size);
fd.close();
db.close();
====
Cheers !