# diff.py - index the files under a directory tree into SQLite, with MD5 checksums.

import concurrent.futures
import hashlib
import os
import sqlite3

from tqdm import tqdm
  6. # 连接到 SQLite 数据库并创建表
  7. conn = sqlite3.connect('file_data.db')
  8. cursor = conn.cursor()
  9. # 创建表格
  10. cursor.execute('''
  11. CREATE TABLE IF NOT EXISTS files (
  12. id TEXT PRIMARY KEY,
  13. path TEXT,
  14. name TEXT,
  15. type TEXT,
  16. size INTEGER,
  17. modification_time TIMESTAMP,
  18. md5 TEXT
  19. )
  20. ''')
  21. def calculate_md5(file_path):
  22. md5_hash = hashlib.md5()
  23. with open(file_path, "rb") as f:
  24. for chunk in iter(lambda: f.read(4096), b""):
  25. md5_hash.update(chunk)
  26. return md5_hash.hexdigest()
  27. def insert_file_data(directory, ignore_list):
  28. total_files = 0 # 用于计算总文件数
  29. # total_files = 78416 # 用于计算总文件数
  30. for root, _, files in os.walk(directory):
  31. total_files += len(files) # 增加目录下的文件数
  32. pbar = tqdm(total=total_files, unit="file") # 使用总文件数初始化进度条
  33. for root, _, files in os.walk(directory):
  34. for file in files:
  35. file_path = os.path.join(root, file)
  36. if os.path.exists(file_path): # 检查文件是否存在
  37. file_name, file_extension = os.path.splitext(file)
  38. file_type = file_extension[1:] # Remove the leading dot from extension
  39. # 设置文件名称作为进度条后缀
  40. pbar.set_postfix(file_path=root)
  41. pbar.update(1) # 每处理完一个文件,更新进度条
  42. # 检查是否在忽略列表中(文件名或目录名)
  43. should_ignore = False
  44. for ignore_item in ignore_list:
  45. if ignore_item in file_path:
  46. should_ignore = True
  47. break
  48. if ignore_item in file_type:
  49. should_ignore = True
  50. break
  51. if should_ignore:
  52. continue
  53. # Get file stats
  54. stat_info = os.stat(file_path)
  55. size = stat_info.st_size
  56. modification_time = stat_info.st_mtime
  57. # 检查是否存在相同ID
  58. cursor.execute('SELECT id FROM files WHERE id=?', (file_path,))
  59. existing_id = cursor.fetchone()
  60. if not existing_id:
  61. md5_digest = calculate_md5(file_path)
  62. cursor.execute(
  63. 'INSERT INTO files (id, path, name, type, size, modification_time, md5) VALUES (?, ?, ?, ?, ?, ?, ?)',
  64. (file_path, file_path, file_name, file_type, size, modification_time, md5_digest))
  65. conn.commit()
  66. else:
  67. print(f"文件不存在: {file_path}")
  68. ignore_list = [
  69. 'node_modules',
  70. '.idea',
  71. 'jar',
  72. '.git',
  73. '.DS_Store',
  74. 'CleanMyMac X.app',
  75. '.pnpm-store',
  76. 'IINA.app',
  77. 'venv',
  78. '.Spotlight-V100',
  79. '.fseventsd',
  80. 'python',
  81. '/Volumes/16T/newFiles/开放/可执行文件/bin/bit'
  82. ]
  83. # target_directory = '/path/to/your/directory'
  84. # target_directory = '/Users/honghaitao/PycharmProjects/pythonProject/diff_file/file'
  85. # target_directory = '/Volumes/16T/柚木'
  86. # target_directory = '/Volumes/20T/待归类'
  87. # target_directory = '/Volumes/16T/电视剧'
  88. # target_directory = '/Volumes/16T/电影'
  89. # target_directory = '/Volumes/16T/电子书'
  90. # target_directory = '/Volumes/16T/工作'
  91. target_directory = '/Volumes/16T/柚木'
  92. # target_directory = '/Volumes/16T'
  93. # target_directory = '/Volumes/16T/黑苹果备份请勿删除'
  94. # target_directory = '/Volumes/16T/纪录片'
  95. # target_directory = '/Volumes/16T/漫画'
  96. # target_directory = '/Volumes/16T/软件'
  97. # target_directory = '/Volumes/16T/数据备份'
  98. # target_directory = '/Volumes/16T/图片'
  99. # target_directory = '/Volumes/16T/西南大学'
  100. # target_directory = '/Volumes/16T/学习'
  101. # target_directory = '/Volumes/16T/游戏'
  102. # target_directory = '/Volumes/16T/娱乐'
  103. # target_directory = '/Volumes/16T/资料'
  104. # target_directory = '/Volumes/16T/BaiduNetdiskDownload'
  105. # target_directory = '/Volumes/16T/bookBack'
  106. # target_directory = '/Volumes/16T/Calibre 书库'
  107. # target_directory = '/Volumes/16T/code'
  108. # target_directory = '/Volumes/16T/codeServer'
  109. # target_directory = '/Volumes/16T/docker'
  110. # target_directory = '/Volumes/16T/Everything.efu'
  111. # target_directory = '/Volumes/16T/gitprojectserver'
  112. # target_directory = '/Volumes/16T/iCloud云盘(归档)'
  113. # target_directory = '/Volumes/16T/newFiles'
  114. # target_directory = '/Volumes/16T/obsidiaProject'
  115. # target_directory = '/Volumes/16T/overload骨王小说插画版'
  116. insert_file_data(target_directory, ignore_list)
  117. # 关闭数据库连接
  118. conn.close()