Commit b048bfedc8 by john, 1 year ago
6 changed files with 233 additions and 0 deletions
  1. .gitignore +6 -0
  2. checksum.py +22 -0
  3. diff.py +71 -0
  4. diskwalk.py +16 -0
  5. getDeff.py +59 -0
  6. main.py +59 -0

+ 6 - 0
.gitignore

@@ -0,0 +1,6 @@
+*.db
+*.csv
+
+file/
+
+.idea/

+ 22 - 0
checksum.py

@@ -0,0 +1,22 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+import hashlib
+
+# Read the file in chunks while computing the MD5: fast and keeps memory use flat.
+
+def create_checksum(path):
+    """Return the MD5 digest of the file at ``path``, read in 8 KB chunks."""
+    checksum = hashlib.md5()
+    # Open in binary mode: hashlib only accepts bytes
+    with open(path, "rb") as fp:
+        while True:
+            buffer = fp.read(8192)
+            if not buffer:
+                break
+            checksum.update(buffer)
+    return checksum.digest()
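
A minimal usage sketch, not part of this commit, assuming two hypothetical files a.bin and b.bin in the working directory: equal MD5 digests mean the two files have identical content.

from checksum import create_checksum

# Identical digests imply identical bytes (barring an MD5 collision)
if create_checksum("a.bin") == create_checksum("b.bin"):
    print("a.bin and b.bin have the same content")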

+ 71 - 0
diff.py

@@ -0,0 +1,71 @@
+import os
+import sqlite3
+import hashlib
+
+# Connect to the SQLite database and create the table
+conn = sqlite3.connect('file_data.db')
+cursor = conn.cursor()
+
+# Create the files table if it does not exist yet
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS files (
+        id TEXT PRIMARY KEY,
+        path TEXT,
+        name TEXT,
+        type TEXT,
+        size INTEGER,
+        modification_time TIMESTAMP,
+        md5 TEXT
+    )
+''')
+
+def calculate_md5(file_path):
+    md5_hash = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5_hash.update(chunk)
+    return md5_hash.hexdigest()
+
+def insert_file_data(directory):
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_name, file_extension = os.path.splitext(file)
+            file_type = file_extension[1:]  # Remove the leading dot from extension
+
+            # Get file stats
+            stat_info = os.stat(file_path)
+            size = stat_info.st_size
+            modification_time = stat_info.st_mtime
+
+            # Check whether a row with the same ID (the file path) already exists
+            cursor.execute('SELECT id FROM files WHERE id=?', (file_path,))
+            existing_id = cursor.fetchone()
+
+            if not existing_id:
+                md5_digest = calculate_md5(file_path)
+                cursor.execute(
+                    'INSERT INTO files (id, path, name, type, size, modification_time, md5) VALUES (?, ?, ?, ?, ?, ?, ?)',
+                    (file_path, file_path, file_name, file_type, size, modification_time, md5_digest))
+                conn.commit()
+
+
+# target_directory = '/path/to/your/directory'
+# target_directory = '/Users/honghaitao/PycharmProjects/pythonProject/diff_file/file'
+# target_directory = '/Volumes/16T/柚木'
+# target_directory = '/Volumes/20T/待归类'
+target_directory = '/Volumes/16T/电视剧'
+insert_file_data(target_directory)
+
+# Close the database connection
+conn.close()
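
A small sanity-check sketch, assuming file_data.db is in the working directory after diff.py has run: it lists every MD5 value shared by more than one file, the same condition getDeff.py builds on.

import sqlite3

conn = sqlite3.connect('file_data.db')
# Group rows by hash and keep only hashes that appear at least twice
for md5, count in conn.execute(
        'SELECT md5, COUNT(*) FROM files GROUP BY md5 HAVING COUNT(*) > 1'):
    print(md5, count)
conn.close()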

+ 16 - 0
diskwalk.py

@@ -0,0 +1,16 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+
+import os
+
+
+class diskwalk(object):
+    def __init__(self, path):
+        self.path = path
+    def paths(self):
+        path = self.path
+        # Use a generator expression so the full list of paths never has to sit in memory
+        path_collection = (os.path.join(root,fn) for root,dirs,files in os.walk(path) for fn in files)
+        return path_collection
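
A usage sketch for diskwalk with a hypothetical directory: paths() returns a generator, so file paths stream out one at a time instead of being collected into a list first.

from diskwalk import diskwalk

# Nothing beyond os.walk's own bookkeeping is held in memory
for path in diskwalk("/some/dir").paths():
    print(path)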

+ 59 - 0
getDeff.py

@@ -0,0 +1,59 @@
+import sqlite3
+
+# Connect to the source database
+conn = sqlite3.connect('file_data.db')
+cursor = conn.cursor()
+
+# Connect to the new database that will hold only the duplicated files
+conn_new = sqlite3.connect('file_data_2.db')
+cursor_new = conn_new.cursor()
+
+# Create the table in the new database (with an extra source_database column)
+cursor_new.execute('''
+    CREATE TABLE IF NOT EXISTS files (
+        id TEXT PRIMARY KEY,
+        path TEXT,
+        name TEXT,
+        type TEXT,
+        size INTEGER,
+        modification_time TIMESTAMP,
+        md5 TEXT,
+        source_database TEXT
+    )
+''')
+
+# Find MD5 hashes that appear on more than one file
+cursor.execute('SELECT md5 FROM files GROUP BY md5 HAVING COUNT(md5) > 1')
+duplicate_md5_rows = cursor.fetchall()
+
+for md5_row in duplicate_md5_rows:
+    md5_value = md5_row[0]
+    cursor.execute('SELECT * FROM files WHERE md5=?', (md5_value,))
+    duplicate_files = cursor.fetchall()
+
+    # Track IDs already handled to avoid inserting the same row twice
+    existing_ids = set()
+
+    for file_data in duplicate_files:
+        id_value = file_data[0]
+
+        if id_value not in existing_ids:
+            # Check whether a record with the same ID already exists in the new database
+            cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
+            existing_id = cursor_new.fetchone()
+
+            if not existing_id:
+                file_data_with_source = list(file_data)
+                file_data_with_source.append('file_data.db')  # Record the source database
+                cursor_new.execute(
+                    'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+                    file_data_with_source)
+
+            existing_ids.add(id_value)  # Remember this ID as handled
+
+    conn_new.commit()
+
+# Close both database connections
+conn.close()
+conn_new.close()
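
Another sketch, assuming getDeff.py has already run: a quick count of how many duplicate groups and rows were copied into file_data_2.db.

import sqlite3

conn = sqlite3.connect('file_data_2.db')
# One distinct md5 per duplicate group; one row per file in a group
groups = conn.execute('SELECT COUNT(DISTINCT md5) FROM files').fetchone()[0]
rows = conn.execute('SELECT COUNT(*) FROM files').fetchone()[0]
print("{} duplicate groups across {} files".format(groups, rows))
conn.close()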

+ 59 - 0
main.py

@@ -0,0 +1,59 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+from checksum import create_checksum
+from diskwalk import diskwalk
+from os.path import getsize
+import csv
+import os
+
+
+def findDupes(path):
+    record = {}
+    dup = {}
+    d = diskwalk(path)
+    files = d.paths()
+    for file in files:
+        try:
+            # Files are compared by (size, file name); to compare by MD5 checksum
+            # instead, switch to the commented-out line below.
+            # compound_key = (getsize(file), create_checksum(file))
+            compound_key = (getsize(file), os.path.basename(file))
+            if compound_key in record:
+                dup[file] = record[compound_key]
+            else:
+                record[compound_key] = file
+        except OSError:
+            # Files that vanish or cannot be read while scanning are skipped
+            continue
+    return dup
+
+
+if __name__ == '__main__':
+    path = '/Volumes/16T/柚木'
+    # path = './file/'
+    csv_path = './file/'
+    # if not os.path.isdir(path) or not os.path.isdir(csv_path) or csv_path[-1] != "/":
+    #     print("The argument is not a valid directory!")
+    #     exit()
+    print("待检测的文件夹为{path}".format(path=path))
+    with open(u"{csv_path}重复文件.csv".format(csv_path=csv_path), "w+") as csvfile:
+        # Columns: original file, duplicate file, size of the duplicate
+        header = ["Source", "Duplicate", "Size"]
+        writer = csv.DictWriter(csvfile, fieldnames=header)
+        writer.writeheader()
+        print("开始遍历文件夹,寻找重复文件,请等待.........")
+        print("开始写入CSV文件,请等待........")
+        for duplicate, source in findDupes(path).items():
+            print(duplicate, source)
+            writer.writerow({"Source": source, "Duplicate": duplicate, "Size": getsize(duplicate)})