1 жил өмнө · 69861f5c79
--- a/getDeff.py
+++ b/getDeff.py
@@ -1,5 +1,6 @@
 
				 import sqlite3
			
 
				 import shutil
			
 
				+from tqdm import tqdm
			
 
				 
			
 
				 # 连接到原始数据库
			
 
				 conn = sqlite3.connect('file_data.db')
			
@@ -23,34 +24,40 @@ cursor_new.execute('''
 
				     )
			
 
				 ''')
			
 
				 
			
 
				-# 查找相同 MD5 哈希值的数据
			
 
				-cursor.execute('SELECT md5 FROM files GROUP BY md5 HAVING COUNT(md5) > 1')
			
 
				+# Find file records that have duplicate MD5 hashes
			
 
				+cursor.execute('''
			
 
				+    SELECT md5, group_concat(id) 
			
 
				+    FROM files 
			
 
				+    GROUP BY md5 
			
 
				+    HAVING COUNT(md5) > 1
			
 
				+''')
			
 
				 duplicate_md5_rows = cursor.fetchall()
			
 
				 
			
 
				-for md5_row in duplicate_md5_rows:
			
 
				-    md5_value = md5_row[0]
			
 
				-    cursor.execute('SELECT * FROM files WHERE md5=?', (md5_value,))
			
 
				-    duplicate_files = cursor.fetchall()
			
 
				+for md5_value, id_list in tqdm(duplicate_md5_rows):
			
 
				+    # group_concat(id) yields one comma-separated string; split it into a tuple of id strings
			
 
				+    ids = tuple(id_list.split(','))
			
 
				 
			
 
				-    # 检查相同的 ID，避免重复插入
			
 
				-    existing_ids = set()
			
 
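				+    # Fetch the full rows for this group's ids, one bound '?' per id (NOTE(review): a very large group could exceed SQLite's bound-parameter limit - confirm)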
			
 
				+    cursor.execute('''
			
 
				+        SELECT DISTINCT id, path, name, type, size, modification_time, md5 
			
 
				+        FROM files 
			
 
				+        WHERE id IN ({})  -- Use a placeholder for ids
			
 
				+    '''.format(','.join(['?'] * len(ids))), ids)  # Pass ids as parameters
			
 
				 
			
 
				-    for file_data in duplicate_files:
			
 
				-        id_value = file_data[0]
			
 
				+    duplicate_files = cursor.fetchall()
			
 
				 
			
 
				-        if id_value not in existing_ids:
			
 
				-            # 检查新数据库中是否已存在相同ID的记录
			
 
				-            cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
			
 
				-            existing_id = cursor_new.fetchone()
			
 
				+    for file_data in duplicate_files:
			
 
				+        id_value, path, name, file_type, size, modification_time, md5_value = file_data
			
 
				+        source_database = 'file_data.db'
			
 
				 
			
 
				-            if not existing_id:
			
 
				-                file_data_with_source = list(file_data)
			
 
				-                file_data_with_source.append('file_data.db')  # 添加数据来源
			
 
				-                cursor_new.execute(
			
 
				-                    'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
			
 
				-                    file_data_with_source)
			
 
				+        # Check if the record already exists in the new database
			
 
				+        cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
			
 
				+        existing_id = cursor_new.fetchone()
			
 
				 
			
 
				-            existing_ids.add(id_value)  # 将已存在的 ID 加入集合
			
 
				+        if not existing_id:
			
 
				+            cursor_new.execute(
			
 
				+                'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
			
 
				+                (id_value, path, name, file_type, size, modification_time, md5_value, source_database))
			
 
				 
			
 
				     conn_new.commit()