123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- import os
- import sqlite3
- import hashlib
- from tqdm import tqdm
- import concurrent.futures
# SQL that creates the `files` table on first run; it is a no-op when the
# table already exists.
_CREATE_FILES_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS files (
        id TEXT PRIMARY KEY,
        path TEXT,
        name TEXT,
        type TEXT,
        size INTEGER,
        modification_time TIMESTAMP,
        md5 TEXT
    )
'''

# Open (or create) the SQLite database file and keep a module-level
# connection and cursor for the rest of the script to use.
conn = sqlite3.connect('file_data.db')
cursor = conn.cursor()
cursor.execute(_CREATE_FILES_TABLE_SQL)
def calculate_md5(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and read in blocks of *chunk_size*
    bytes, so the whole file is never held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.
            The digest does not depend on this value.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently produce the digest
    # of an empty file; reject such sizes up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    md5_hash = hashlib.md5()
    with open(file_path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:  # empty bytes object signals end of file
                break
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
def _is_ignored(path, ignore_list):
    """Return True when any entry of *ignore_list* is a substring of *path*."""
    return any(ignore_item in path for ignore_item in ignore_list)


def _walk_pruned(directory, ignore_list):
    """Yield ``(root, files)`` like ``os.walk`` without entering ignored dirs."""
    for root, dirs, files in os.walk(directory):
        # Editing `dirs` in place stops os.walk from descending into them.
        # Every file below an ignored directory would match the same
        # substring, so pruning skips only files that would be ignored anyway.
        dirs[:] = [
            d for d in dirs
            if not _is_ignored(os.path.join(root, d), ignore_list)
        ]
        yield root, files


def _record_file(file_path, file):
    """Insert one row for *file_path* unless a row with that id exists."""
    try:
        stat_info = os.stat(file_path)
    except OSError:
        # Covers files that vanished since the walk and broken symlinks.
        print(f"文件不存在: {file_path}")
        return

    # The full path doubles as the primary key; already-indexed files are
    # skipped before the expensive hash is computed.
    cursor.execute('SELECT id FROM files WHERE id=?', (file_path,))
    if cursor.fetchone():
        return

    try:
        md5_digest = calculate_md5(file_path)
    except OSError as exc:
        # One unreadable file must not abort the whole scan.
        print(f"无法读取文件: {file_path} ({exc})")
        return

    file_name, file_extension = os.path.splitext(file)
    file_type = file_extension[1:]  # drop the leading dot
    cursor.execute(
        'INSERT INTO files (id, path, name, type, size, modification_time, md5) VALUES (?, ?, ?, ?, ?, ?, ?)',
        (file_path, file_path, file_name, file_type,
         stat_info.st_size, stat_info.st_mtime, md5_digest))


def insert_file_data(directory, ignore_list):
    """Index every non-ignored file under *directory* into the `files` table.

    The tree is walked once to count files for the progress bar and a second
    time to process them. For each file whose path is not yet stored as an
    id, its path, base name, extension, size, modification time and MD5
    digest are inserted through the module-level ``cursor``; the module-level
    ``conn`` is committed after each directory.

    Args:
        directory: Root directory to scan recursively.
        ignore_list: Strings; any file or directory whose full path contains
            one of them as a substring is skipped.
    """
    total_files = sum(
        len(files) for _, files in _walk_pruned(directory, ignore_list)
    )

    # The context manager closes the bar even if the scan is interrupted.
    with tqdm(total=total_files, unit="file") as pbar:
        for root, files in _walk_pruned(directory, ignore_list):
            pbar.set_postfix(file_path=root)
            try:
                for file in files:
                    pbar.update(1)
                    file_path = os.path.join(root, file)
                    if _is_ignored(file_path, ignore_list):
                        continue
                    _record_file(file_path, file)
            finally:
                # Commit per directory rather than per row, and still keep
                # the rows gathered so far if the scan is interrupted.
                conn.commit()
# Substrings that exclude a file from indexing: insert_file_data skips any
# path that contains one of these entries.
ignore_list = [
    'node_modules',
    '.idea',
    'jar',
    '.git',
    '.DS_Store',
    'CleanMyMac X.app',
    '.pnpm-store',
    'IINA.app',
    'venv',
    '.Spotlight-V100',
    '.fseventsd',
    'python',
    '/Volumes/16T/newFiles/开放/可执行文件/bin/bit',
]

# Root directory to scan. The commented-out lines are alternative roots;
# exactly one assignment should be active.
# target_directory = '/path/to/your/directory'
# target_directory = '/Users/honghaitao/PycharmProjects/pythonProject/diff_file/file'
# target_directory = '/Volumes/16T/柚木'
# target_directory = '/Volumes/20T/待归类'
# target_directory = '/Volumes/16T/电视剧'
# target_directory = '/Volumes/16T/电影'
# target_directory = '/Volumes/16T/电子书'
# target_directory = '/Volumes/16T/工作'
target_directory = '/Volumes/16T/柚木'
# target_directory = '/Volumes/16T'
# target_directory = '/Volumes/16T/黑苹果备份请勿删除'
# target_directory = '/Volumes/16T/纪录片'
# target_directory = '/Volumes/16T/漫画'
# target_directory = '/Volumes/16T/软件'
# target_directory = '/Volumes/16T/数据备份'
# target_directory = '/Volumes/16T/图片'
# target_directory = '/Volumes/16T/西南大学'
# target_directory = '/Volumes/16T/学习'
# target_directory = '/Volumes/16T/游戏'
# target_directory = '/Volumes/16T/娱乐'
# target_directory = '/Volumes/16T/资料'
# target_directory = '/Volumes/16T/BaiduNetdiskDownload'
# target_directory = '/Volumes/16T/bookBack'
# target_directory = '/Volumes/16T/Calibre 书库'
# target_directory = '/Volumes/16T/code'
# target_directory = '/Volumes/16T/codeServer'
# target_directory = '/Volumes/16T/docker'
# target_directory = '/Volumes/16T/Everything.efu'
# target_directory = '/Volumes/16T/gitprojectserver'
# target_directory = '/Volumes/16T/iCloud云盘(归档)'
# target_directory = '/Volumes/16T/newFiles'
# target_directory = '/Volumes/16T/obsidiaProject'
# target_directory = '/Volumes/16T/overload骨王小说插画版'

try:
    insert_file_data(target_directory, ignore_list)
finally:
    # Close the database connection even when the scan raises or is
    # interrupted from the keyboard.
    conn.close()
|