Commit b048bfedc8 by john, 1 year ago
6 changed files with 233 additions and 0 deletions
  1. .gitignore +6 -0
  2. checksum.py +22 -0
  3. diff.py +71 -0
  4. diskwalk.py +16 -0
  5. getDeff.py +59 -0
  6. main.py +59 -0

+ 6 - 0
.gitignore

@@ -0,0 +1,6 @@
+*.db
+*.csv
+
+file/
+
+.idea/

+ 22 - 0
checksum.py

@@ -0,0 +1,22 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+import hashlib
+
+# Read the file in chunks while computing the MD5: fast and keeps memory use flat.
+
+def create_checksum(path):
+    """Return the MD5 digest of the file at ``path``, read in 8 KB chunks."""
+    checksum = hashlib.md5()
+    # Open in binary mode: hashlib only accepts bytes
+    with open(path, "rb") as fp:
+        while True:
+            buffer = fp.read(8192)
+            if not buffer:
+                break
+            checksum.update(buffer)
+    return checksum.digest()
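
A minimal usage sketch, not part of this commit, assuming two hypothetical files a.bin and b.bin in the working directory: equal MD5 digests mean the two files have identical content.

from checksum import create_checksum

# Identical digests imply identical bytes (barring an MD5 collision)
if create_checksum("a.bin") == create_checksum("b.bin"):
    print("a.bin and b.bin have the same content")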

+ 71 - 0
diff.py

@@ -0,0 +1,71 @@
+import os
+import sqlite3
+import hashlib
+
+# Connect to the SQLite database and create the table
+conn = sqlite3.connect('file_data.db')
+cursor = conn.cursor()
+
+# Create the files table if it does not exist yet
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS files (
+        id TEXT PRIMARY KEY,
+        path TEXT,
+        name TEXT,
+        type TEXT,
+        size INTEGER,
+        modification_time TIMESTAMP,
+        md5 TEXT
+    )
+''')
+
+def calculate_md5(file_path):
+    md5_hash = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5_hash.update(chunk)
+    return md5_hash.hexdigest()
+
+def insert_file_data(directory):
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_name, file_extension = os.path.splitext(file)
+            file_type = file_extension[1:]  # Remove the leading dot from extension
+
+            # Get file stats
+            stat_info = os.stat(file_path)
+            size = stat_info.st_size
+            modification_time = stat_info.st_mtime
+
+            # Check whether a row with the same ID (the file path) already exists
+            cursor.execute('SELECT id FROM files WHERE id=?', (file_path,))
+            existing_id = cursor.fetchone()
+
+            if not existing_id:
+                md5_digest = calculate_md5(file_path)
+                cursor.execute(
+                    'INSERT INTO files (id, path, name, type, size, modification_time, md5) VALUES (?, ?, ?, ?, ?, ?, ?)',
+                    (file_path, file_path, file_name, file_type, size, modification_time, md5_digest))
+                conn.commit()
+
+
+# target_directory = '/path/to/your/directory'
+# target_directory = '/Users/honghaitao/PycharmProjects/pythonProject/diff_file/file'
+# target_directory = '/Volumes/16T/柚木'
+# target_directory = '/Volumes/20T/待归类'
+target_directory = '/Volumes/16T/电视剧'
+insert_file_data(target_directory)
+
+# Close the database connection
+conn.close()
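
A small sanity-check sketch, assuming file_data.db is in the working directory after diff.py has run: it lists every MD5 value shared by more than one file, the same condition getDeff.py builds on.

import sqlite3

conn = sqlite3.connect('file_data.db')
# Group rows by hash and keep only hashes that appear at least twice
for md5, count in conn.execute(
        'SELECT md5, COUNT(*) FROM files GROUP BY md5 HAVING COUNT(*) > 1'):
    print(md5, count)
conn.close()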

+ 16 - 0
diskwalk.py

@@ -0,0 +1,16 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+
+import os
+
+
+class diskwalk(object):
+    def __init__(self, path):
+        self.path = path
+    def paths(self):
+        path = self.path
+        # Use a generator expression so the full list of paths never has to sit in memory
+        path_collection = (os.path.join(root,fn) for root,dirs,files in os.walk(path) for fn in files)
+        return path_collection
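
A usage sketch for diskwalk with a hypothetical directory: paths() returns a generator, so file paths stream out one at a time instead of being collected into a list first.

from diskwalk import diskwalk

# Nothing beyond os.walk's own bookkeeping is held in memory
for path in diskwalk("/some/dir").paths():
    print(path)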

+ 59 - 0
getDeff.py

@@ -0,0 +1,59 @@
+import sqlite3
+
+# Connect to the source database
+conn = sqlite3.connect('file_data.db')
+cursor = conn.cursor()
+
+# Connect to the new database that will hold only the duplicated files
+conn_new = sqlite3.connect('file_data_2.db')
+cursor_new = conn_new.cursor()
+
+# Create the table in the new database (with an extra source_database column)
+cursor_new.execute('''
+    CREATE TABLE IF NOT EXISTS files (
+        id TEXT PRIMARY KEY,
+        path TEXT,
+        name TEXT,
+        type TEXT,
+        size INTEGER,
+        modification_time TIMESTAMP,
+        md5 TEXT,
+        source_database TEXT
+    )
+''')
+
+# Find MD5 hashes that appear on more than one file
+cursor.execute('SELECT md5 FROM files GROUP BY md5 HAVING COUNT(md5) > 1')
+duplicate_md5_rows = cursor.fetchall()
+
+for md5_row in duplicate_md5_rows:
+    md5_value = md5_row[0]
+    cursor.execute('SELECT * FROM files WHERE md5=?', (md5_value,))
+    duplicate_files = cursor.fetchall()
+
+    # Track IDs already handled to avoid inserting the same row twice
+    existing_ids = set()
+
+    for file_data in duplicate_files:
+        id_value = file_data[0]
+
+        if id_value not in existing_ids:
+            # Check whether a record with the same ID already exists in the new database
+            cursor_new.execute('SELECT id FROM files WHERE id=?', (id_value,))
+            existing_id = cursor_new.fetchone()
+
+            if not existing_id:
+                file_data_with_source = list(file_data)
+                file_data_with_source.append('file_data.db')  # Record the source database
+                cursor_new.execute(
+                    'INSERT INTO files (id, path, name, type, size, modification_time, md5, source_database) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+                    file_data_with_source)
+
+            existing_ids.add(id_value)  # Remember this ID as handled
+
+    conn_new.commit()
+
+# Close both database connections
+conn.close()
+conn_new.close()
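
Another sketch, assuming getDeff.py has already run: a quick count of how many duplicate groups and rows were copied into file_data_2.db.

import sqlite3

conn = sqlite3.connect('file_data_2.db')
# One distinct md5 per duplicate group; one row per file in a group
groups = conn.execute('SELECT COUNT(DISTINCT md5) FROM files').fetchone()[0]
rows = conn.execute('SELECT COUNT(*) FROM files').fetchone()[0]
print("{} duplicate groups across {} files".format(groups, rows))
conn.close()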

+ 59 - 0
main.py

@@ -0,0 +1,59 @@
+# coding: utf-8
+
+__author__ = "lau.wenbo"
+
+from checksum import create_checksum
+from diskwalk import diskwalk
+from os.path import getsize
+import csv
+import os
+
+
+def findDupes(path):
+    record = {}
+    dup = {}
+    d = diskwalk(path)
+    files = d.paths()
+    for file in files:
+        try:
+            # Files are compared by (size, file name); to compare by MD5 checksum
+            # instead, switch to the commented-out line below.
+            # compound_key = (getsize(file), create_checksum(file))
+            compound_key = (getsize(file), os.path.basename(file))
+            if compound_key in record:
+                dup[file] = record[compound_key]
+            else:
+                record[compound_key] = file
+        except OSError:
+            # Files that vanish or cannot be read while scanning are skipped
+            continue
+    return dup
+
+
+if __name__ == '__main__':
+    path = '/Volumes/16T/柚木'
+    # path = './file/'
+    csv_path = './file/'
+    # if not os.path.isdir(path) or not os.path.isdir(csv_path) or csv_path[-1] != "/":
+    #     print("The argument is not a valid directory!")
+    #     exit()
+    print("待检测的文件夹为{path}".format(path=path))
+    with open(u"{csv_path}重复文件.csv".format(csv_path=csv_path), "w+") as csvfile:
+        # Columns: original file, duplicate file, size of the duplicate
+        header = ["Source", "Duplicate", "Size"]
+        writer = csv.DictWriter(csvfile, fieldnames=header)
+        writer.writeheader()
+        print("开始遍历文件夹,寻找重复文件,请等待.........")
+        print("开始写入CSV文件,请等待........")
+        for duplicate, source in findDupes(path).items():
+            print(duplicate, source)
+            writer.writerow({"Source": source, "Duplicate": duplicate, "Size": getsize(duplicate)})