1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- from tqdm import tqdm
- import os
- from chardet import detect
# Path of the log file
log_file_path = "conversion_log.txt"
def detect_file_encoding(file_path, sample_size=1024):
    """
    Guess the text encoding of a file from its leading bytes.

    Only the first ``sample_size`` bytes are read and handed to chardet's
    ``detect``; the rest of the file is never inspected.

    :param file_path: path of the file to inspect
    :param sample_size: number of leading bytes to sample (default 1024)
    :return: the encoding name reported by the detector, or ``'utf-8'``
        when the detector reports no encoding (for example an empty file)
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read(sample_size)  # a partial read keeps detection cheap
    result = detect(raw_data)
    # chardet keeps the 'encoding' key in its result even when detection
    # fails, with the value None, so a dict.get() default would never be
    # used. `or` covers both a missing key and a None value.
    return result.get('encoding') or 'utf-8'
def log_message_to_file(log_file_path, message):
    """
    Append a single line to the log file.

    The file is opened in append mode as UTF-8 and ``message`` is written
    followed by a newline. Any failure is reported on standard output
    instead of being raised, so logging never interrupts the caller.

    :param log_file_path: path of the log file to append to
    :param message: text of the log entry (without trailing newline)
    """
    try:
        with open(log_file_path, mode='a', encoding='utf-8') as sink:
            entry = message + '\n'
            sink.write(entry)
    except Exception as err:
        print(f"无法写入日志文件:{err}")
def convert_encoding_in_dir(directory, dest_encoding, extensions, src_encoding="GB2312"):
    """
    Re-encode, in place, every file under a directory tree whose name ends
    with one of the given extensions, showing a progress bar.

    The source encoding is assumed, not detected: every file is decoded as
    ``src_encoding``, and bytes that cannot be decoded are silently dropped
    (best-effort conversion). The outcome for each file, success or failure,
    is appended to the module-level log file; a failure on one file does not
    stop the batch.

    :param directory: root directory to walk recursively
    :param dest_encoding: encoding the files are rewritten in
    :param extensions: file extensions to process (e.g. ['.txt', '.html']);
        matching is case-insensitive
    :param src_encoding: encoding the files are assumed to be in
        (default "GB2312")
    """
    # Lower-case both sides so '.TXT' in `extensions` matches as well, and
    # build the tuple once so str.endswith can test all suffixes in one call.
    suffixes = tuple(ext.lower() for ext in extensions)

    # Collect all matching file paths up front so the progress bar has a total.
    files_to_process = []
    for root, _, files in os.walk(directory):
        files_to_process.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(suffixes)
        )

    for file_path in tqdm(files_to_process, desc="转换文件编码进度", unit="文件"):
        try:
            # Work on raw bytes: text mode would translate line endings and
            # quietly change them in a conversion that should only touch
            # the encoding.
            with open(file_path, 'rb') as f:
                raw = f.read()
            # Finish decoding and encoding in memory BEFORE reopening the
            # file for writing. Opening with 'w' truncates immediately, so
            # an unencodable character or an unknown codec name would
            # otherwise destroy the original content.
            converted = raw.decode(src_encoding, errors='ignore').encode(dest_encoding)
            with open(file_path, 'wb') as f:
                f.write(converted)
        except Exception as e:
            # Per-file boundary: record the failure and carry on with the rest.
            error_message = f"无法处理文件 {file_path}: {e}"
            log_message_to_file(log_file_path, error_message)
        else:
            log_message = f"已成功转换文件编码: {file_path} (从 {src_encoding} 转到 {dest_encoding})"
            log_message_to_file(log_file_path, log_message)
# Example invocation
directory = "./全书"  # replace with the path of your target folder
dest_encoding = "utf-8"
# File extensions that should be processed
extensions = [
    '.txt',
    '.html',
    '.htm',
    '.js',
    '.css',
]
convert_encoding_in_dir(directory, dest_encoding, extensions)
|