"""Copy duplicate-MD5 file records from file_data.db into file_data_2.db.

Finds every MD5 hash that appears on more than one row of the source
``files`` table, then copies each of those rows into a second database,
tagging them with the database they came from.  Rows whose ``id`` already
exists in the destination are left untouched.
"""

import sqlite3
from contextlib import closing

from tqdm import tqdm

SOURCE_DB = 'file_data.db'
DEST_DB = 'file_data_2.db'

# closing() guarantees both connections are released even if a query raises.
with closing(sqlite3.connect(SOURCE_DB)) as conn, \
     closing(sqlite3.connect(DEST_DB)) as conn_new:
    cursor = conn.cursor()
    cursor_new = conn_new.cursor()

    # Destination schema mirrors the source table plus a provenance column.
    cursor_new.execute('''
    CREATE TABLE IF NOT EXISTS files (
        id TEXT PRIMARY KEY,
        path TEXT,
        name TEXT,
        type TEXT,
        size INTEGER,
        modification_time TIMESTAMP,
        md5 TEXT,
        source_database TEXT
    )
    ''')

    # Find MD5 hashes shared by more than one file record; group_concat
    # yields the comma-separated ids of each duplicate group.
    cursor.execute('''
    SELECT md5, group_concat(id)
    FROM files
    GROUP BY md5
    HAVING COUNT(md5) > 1
    ''')
    duplicate_md5_rows = cursor.fetchall()

    for _md5, id_list in tqdm(duplicate_md5_rows):
        ids = tuple(id_list.split(','))
        # One "?" placeholder per id keeps the query parameterized.
        placeholders = ','.join('?' * len(ids))
        cursor.execute('''
        SELECT DISTINCT id, path, name, type, size, modification_time, md5
        FROM files
        WHERE id IN ({})
        '''.format(placeholders), ids)
        # INSERT OR IGNORE leans on the PRIMARY KEY on id: rows already
        # present in the destination are skipped without a prior SELECT,
        # replacing the original SELECT-then-INSERT existence check.
        cursor_new.executemany(
            'INSERT OR IGNORE INTO files '
            '(id, path, name, type, size, modification_time, md5, source_database) '
            'VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
            [row + (SOURCE_DB,) for row in cursor.fetchall()])

    # Commit once for the whole run instead of once per inserted row —
    # per-row commits were the dominant cost of the original loop.
    conn_new.commit()