main.py

# coding: utf-8
__author__ = "lau.wenbo"

from checksum import create_checksum
from diskwalk import diskwalk
from os.path import getsize
import csv
import os
import sys

# Python 2 only, not needed on Python 3:
# reload(sys)
# sys.setdefaultencoding('utf8')


def findDupes(path):
    record = {}
    dup = {}
    d = diskwalk(path)
    files = d.paths()
    for file in files:
        try:
            # Files are compared by (size, file name); uncomment the next line
            # to compare by (size, MD5 checksum) instead.
            # compound_key = (getsize(file), create_checksum(file))
            compound_key = (getsize(file), file.split("/")[-1])
            if compound_key in record:
                # Key seen before: remember the duplicate and the file it duplicates.
                dup[file] = record[compound_key]
            else:
                record[compound_key] = file
        except OSError:
            # Skip files that cannot be stat'ed or read.
            continue
    return dup


if __name__ == '__main__':
    path = '/Volumes/16T/柚木'
    # path = './file/'
    csv_path = './file/'
    # if not os.path.isdir(path) or not os.path.isdir(csv_path) or csv_path[-1] != "/":
    #     print("The argument is not a valid directory!")
    #     exit()
    # else:
    #     path = path.decode("utf-8")
    print("Directory to scan: {path}".format(path=path))
    # 重复文件.csv = "duplicate files.csv"
    with open("{csv_path}重复文件.csv".format(csv_path=csv_path), "w+", newline="") as csvfile:
        # Columns: source file, duplicate file, size
        header = ["Source", "Duplicate", "Size"]
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        print("Walking the directory and looking for duplicate files, please wait.........")
        print("Writing the CSV file, please wait........")
        for duplicate, source in findDupes(path).items():
            print((duplicate, source))
            writer.writerow({"Source": source, "Duplicate": duplicate, "Size": getsize(duplicate)})
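
The script depends on two local modules, checksum and diskwalk, that are not shown here. Below is a minimal sketch of what they could look like, assuming create_checksum(path) returns the MD5 hex digest of a file's contents and diskwalk(path).paths() returns every file path under path; the actual modules in the original project may differ.

checksum.py / diskwalk.py (hypothetical sketch)

# coding: utf-8
# checksum.py -- assumed interface: create_checksum(path) -> MD5 hex digest of the file's contents
import hashlib


def create_checksum(path):
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        # Read in chunks so large files do not have to fit in memory.
        for chunk in iter(lambda: f.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()


# diskwalk.py -- assumed interface: diskwalk(path).paths() -> list of all file paths under path
import os


class diskwalk(object):
    def __init__(self, path):
        self.path = path

    def paths(self):
        result = []
        for root, dirs, files in os.walk(self.path):
            for name in files:
                result.append(os.path.join(root, name))
        return result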