Эх сурвалжийг харах

7: 增加可视化进度条

john 1 жил өмнө
parent
commit
69861f5c79
1 өөрчлөгдсөн 28 нэмэгдсэн, 21 устгасан
  1. 28 21
      getDeff.py

+ 28 - 21
getDeff.py

@@ -1,5 +1,6 @@
 import sqlite3
 import shutil
+from tqdm import tqdm
 
 # 连接到原始数据库
 conn = sqlite3.connect('file_data.db')
@@ -23,34 +24,40 @@ cursor_new.execute('''
     )
 ''')
 
-# 查找相同 MD5 哈希值的数据
-cursor.execute('SELECT md5 FROM files GROUP BY md5 HAVING COUNT(md5) > 1')
+# 找到具有重复 MD5 哈希的文件记录
+cursor.execute('''
+    SELECT md5, group_concat(id) 
+    FROM files 
+    GROUP BY md5 
+    HAVING COUNT(md5) > 1
+''')
 duplicate_md5_rows = cursor.fetchall()
 
-for md5_row in duplicate_md5_rows:
-    md5_value = md5_row[0]
-    cursor.execute('SELECT * FROM files WHERE md5=?', (md5_value,))
-    duplicate_files = cursor.fetchall()
+for md5_value, id_list in tqdm(duplicate_md5_rows):
+    # Convert id_list to a tuple of ids
+    ids = tuple(id_list.split(','))
 
-    # 检查相同的 ID,避免重复插入
-    existing_ids = set()
+    # Getting duplicate files
+    cursor.execute('''
+        SELECT DISTINCT id, path, name, type, size, modification_time, md5 
+        FROM files 
+        WHERE id IN ({})  -- Use a placeholder for ids
+    '''.format(','.join(['?'] * len(ids))), ids)  # Pass ids as parameters
 
-    for file_data in duplicate_files:
-        id_value = file_data[0]
+    duplicate_files = cursor.fetchall()
 
-        if id_value not in existing_ids:
-            # 检查新数据库中是否已存在相同ID的记录
-            cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
-            existing_id = cursor_new.fetchone()
+    for file_data in duplicate_files:
+        id_value, path, name, file_type, size, modification_time, md5_value = file_data
+        source_database = 'file_data.db'
 
-            if not existing_id:
-                file_data_with_source = list(file_data)
-                file_data_with_source.append('file_data.db')  # 添加数据来源
-                cursor_new.execute(
-                    'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
-                    file_data_with_source)
+        # Check if the record already exists in the new database
+        cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
+        existing_id = cursor_new.fetchone()
 
-            existing_ids.add(id_value)  # 将已存在的 ID 加入集合
+        if not existing_id:
+            cursor_new.execute(
+                'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+                (id_value, path, name, file_type, size, modification_time, md5_value, source_database))
 
     conn_new.commit()