"""Copy file records that share an MD5 hash (i.e. duplicates) from one
SQLite database into a second database, tagging each row with its source.

When run as a script, reads from ``file_data.db`` and writes the
duplicate rows into ``file_data_2.db``.
"""
import sqlite3
import shutil  # NOTE(review): unused in this script -- confirm before removing

# Destination schema: mirrors the source ``files`` table plus a
# ``source_database`` column recording where each row came from.
_FILES_SCHEMA = '''
CREATE TABLE IF NOT EXISTS files (
    id TEXT PRIMARY KEY,
    path TEXT,
    name TEXT,
    type TEXT,
    size INTEGER,
    modification_time TIMESTAMP,
    md5 TEXT,
    source_database TEXT
)
'''


def find_duplicate_md5_groups(cursor):
    """Return (md5, comma-separated id list) for every MD5 value shared
    by more than one row in the source ``files`` table.

    NULL md5 groups are excluded automatically: COUNT(md5) ignores NULLs,
    so an all-NULL group never satisfies ``> 1``.
    """
    cursor.execute('''
        SELECT md5, group_concat(id)
        FROM files
        GROUP BY md5
        HAVING COUNT(md5) > 1
    ''')
    return cursor.fetchall()


def copy_duplicate_records(src_conn, dst_conn,
                           source_database='file_data.db',
                           progress=lambda it: it):
    """Copy every row belonging to a duplicate-MD5 group from *src_conn*
    into the ``files`` table of *dst_conn*.

    Args:
        src_conn: open sqlite3 connection to the source database
            (must contain a ``files`` table).
        dst_conn: open sqlite3 connection to the destination database;
            the ``files`` table is created there if missing.
        source_database: value stored in the ``source_database`` column.
        progress: wrapper applied to the group iterable (e.g. ``tqdm``).

    Returns:
        Number of rows newly inserted into the destination.
    """
    src_cursor = src_conn.cursor()
    dst_cursor = dst_conn.cursor()
    dst_cursor.execute(_FILES_SCHEMA)

    inserted = 0
    for _md5, id_csv in progress(find_duplicate_md5_groups(src_cursor)):
        # group_concat joins ids with ','; assumes ids themselves contain
        # no commas -- TODO confirm against the id format.
        ids = tuple(id_csv.split(','))
        placeholders = ','.join('?' * len(ids))
        src_cursor.execute(
            'SELECT DISTINCT id, path, name, type, size, modification_time, md5 '
            'FROM files WHERE id IN ({})'.format(placeholders),
            ids)
        for row in src_cursor.fetchall():
            # id is the PRIMARY KEY, so INSERT OR IGNORE is equivalent to
            # the original SELECT-then-INSERT existence check, minus the
            # extra query and its read/insert race window.
            dst_cursor.execute(
                'INSERT OR IGNORE INTO files '
                '(id, path, name, type, size, modification_time, md5, source_database) '
                'VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                row + (source_database,))
            # rowcount is 1 when the row was inserted, 0 when ignored.
            inserted += dst_cursor.rowcount
    # One commit for the whole run instead of one per row: identical end
    # state, far fewer fsyncs.
    dst_conn.commit()
    return inserted


def main():
    """Script entry point: copy duplicates from file_data.db to file_data_2.db."""
    # Local import: tqdm is only needed for the CLI progress bar, so the
    # module's logic stays importable without the third-party dependency.
    from tqdm import tqdm

    conn = sqlite3.connect('file_data.db')
    conn_new = sqlite3.connect('file_data_2.db')
    try:
        copy_duplicate_records(conn, conn_new,
                               source_database='file_data.db',
                               progress=tqdm)
    finally:
        # Always release both connections, even if the copy fails midway.
        conn.close()
        conn_new.close()


if __name__ == '__main__':
    main()