getDeff.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import sqlite3
  2. import shutil
  3. from tqdm import tqdm
  4. # 连接到原始数据库
  5. conn = sqlite3.connect('file_data.db')
  6. cursor = conn.cursor()
  7. # 连接到新的数据库
  8. conn_new = sqlite3.connect('file_data_2.db')
  9. cursor_new = conn_new.cursor()
  10. # 创建新表格
  11. cursor_new.execute('''
  12. CREATE TABLE IF NOT EXISTS files (
  13. id TEXT PRIMARY KEY,
  14. path TEXT,
  15. name TEXT,
  16. type TEXT,
  17. size INTEGER,
  18. modification_time TIMESTAMP,
  19. md5 TEXT,
  20. source_database TEXT
  21. )
  22. ''')
  23. # 找到具有重复 MD5 哈希的文件记录
  24. cursor.execute('''
  25. SELECT md5, group_concat(id)
  26. FROM files
  27. GROUP BY md5
  28. HAVING COUNT(md5) > 1
  29. ''')
  30. duplicate_md5_rows = cursor.fetchall()
  31. for md5_value, id_list in tqdm(duplicate_md5_rows):
  32. # Convert id_list to a tuple of ids
  33. ids = tuple(id_list.split(','))
  34. # Getting duplicate files
  35. cursor.execute('''
  36. SELECT DISTINCT id, path, name, type, size, modification_time, md5
  37. FROM files
  38. WHERE id IN ({}) -- Use a placeholder for ids
  39. '''.format(','.join(['?'] * len(ids))), ids) # Pass ids as parameters
  40. duplicate_files = cursor.fetchall()
  41. for file_data in duplicate_files:
  42. id_value, path, name, file_type, size, modification_time, md5_value = file_data
  43. source_database = 'file_data.db'
  44. # Check if the record already exists in the new database
  45. cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
  46. existing_id = cursor_new.fetchone()
  47. if not existing_id:
  48. cursor_new.execute(
  49. 'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
  50. (id_value, path, name, file_type, size, modification_time, md5_value, source_database))
  51. conn_new.commit()
  52. # 关闭数据库连接
  53. conn.close()
  54. conn_new.close()