|
@@ -1,5 +1,6 @@
|
|
|
import sqlite3
|
|
|
import shutil
|
|
|
+from tqdm import tqdm
|
|
|
|
|
|
# 连接到原始数据库
|
|
|
conn = sqlite3.connect('file_data.db')
|
|
@@ -23,34 +24,40 @@ cursor_new.execute('''
|
|
|
)
|
|
|
''')
|
|
|
|
|
|
-# 查找相同 MD5 哈希值的数据
|
|
|
-cursor.execute('SELECT md5 FROM files GROUP BY md5 HAVING COUNT(md5) > 1')
|
|
|
+# 找到具有重复 MD5 哈希的文件记录
|
|
|
+cursor.execute('''
|
|
|
+ SELECT md5, group_concat(id)
|
|
|
+ FROM files
|
|
|
+ GROUP BY md5
|
|
|
+ HAVING COUNT(md5) > 1
|
|
|
+''')
|
|
|
duplicate_md5_rows = cursor.fetchall()
|
|
|
|
|
|
-for md5_row in duplicate_md5_rows:
|
|
|
- md5_value = md5_row[0]
|
|
|
- cursor.execute('SELECT * FROM files WHERE md5=?', (md5_value,))
|
|
|
- duplicate_files = cursor.fetchall()
|
|
|
+for md5_value, id_list in tqdm(duplicate_md5_rows):
|
|
|
+ # Convert id_list to a tuple of ids
|
|
|
+ ids = tuple(id_list.split(','))
|
|
|
|
|
|
- # 检查相同的 ID,避免重复插入
|
|
|
- existing_ids = set()
|
|
|
+ # Getting duplicate files
|
|
|
+ cursor.execute('''
|
|
|
+ SELECT DISTINCT id, path, name, type, size, modification_time, md5
|
|
|
+ FROM files
|
|
|
+ WHERE id IN ({}) -- Use a placeholder for ids
|
|
|
+ '''.format(','.join(['?'] * len(ids))), ids) # Pass ids as parameters
|
|
|
|
|
|
- for file_data in duplicate_files:
|
|
|
- id_value = file_data[0]
|
|
|
+ duplicate_files = cursor.fetchall()
|
|
|
|
|
|
- if id_value not in existing_ids:
|
|
|
- # 检查新数据库中是否已存在相同ID的记录
|
|
|
- cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
|
|
|
- existing_id = cursor_new.fetchone()
|
|
|
+ for file_data in duplicate_files:
|
|
|
+ id_value, path, name, file_type, size, modification_time, md5_value = file_data
|
|
|
+ source_database = 'file_data.db'
|
|
|
|
|
|
- if not existing_id:
|
|
|
- file_data_with_source = list(file_data)
|
|
|
- file_data_with_source.append('file_data.db') # 添加数据来源
|
|
|
- cursor_new.execute(
|
|
|
- 'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
|
|
|
- file_data_with_source)
|
|
|
+ # Check if the record already exists in the new database
|
|
|
+ cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
|
|
|
+ existing_id = cursor_new.fetchone()
|
|
|
|
|
|
- existing_ids.add(id_value) # 将已存在的 ID 加入集合
|
|
|
+ if not existing_id:
|
|
|
+ cursor_new.execute(
|
|
|
+ 'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
|
|
|
+ (id_value, path, name, file_type, size, modification_time, md5_value, source_database))
|
|
|
|
|
|
conn_new.commit()
|
|
|
|