"""Process every HTML file under a directory tree, rewriting each file in place."""

import logging
import sys
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Union

from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Directory processed when no path is supplied on the command line.
DEFAULT_TARGET_DIRECTORY = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"


class HTMLProcessor:
    """Processor that parses HTML files and writes the result back in place.

    NOTE(review): this file was restored from a damaged copy in which line
    structure was flattened and markup-looking text inside string literals
    was stripped. Two pieces could not be recovered and must be restored
    from version control before this tool is used:

    * most of the replacement patterns of ``_filter_redundant_text``;
    * the whole body of ``_clean_soup``.
    """

    # (old, new) pairs applied in order by _filter_redundant_text.
    #
    # NOTE(review): only the first pair of the damaged source is kept. It is
    # reconstructed from raw line breaks that appeared inside the quotes, so
    # the original literals may have contained markup that was stripped --
    # confirm against the real file.
    #
    # Deliberately NOT carried over:
    # * a pair whose search string had become empty ('' -> '\nb');
    #   str.replace with an empty search string inserts the replacement
    #   between every character and would wreck each file;
    # * a third pair starting with '>>\n\n' whose remainder (and everything
    #   that followed it in the method) is missing.
    _TEXT_REPLACEMENTS = (
        ("\n\n\n", "\n\n"),
    )

    def __init__(self, max_workers: int = 4):
        """Store the thread-pool size used by ``process_directory``."""
        self.max_workers = max_workers

    def _filter_redundant_text(self, file_path: Union[str, Path]) -> str:
        """Read *file_path* as UTF-8 and return its text with redundant text filtered.

        The file itself is not modified here.

        NOTE(review): the damaged source discarded this method's return value
        and re-read the file in ``process_file``, which suggests the lost tail
        of the method wrote the filtered text back to disk -- unconfirmed.
        Returning the text and letting the caller parse it directly yields the
        same final file with a single write.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        for old, new in self._TEXT_REPLACEMENTS:
            content = content.replace(old, new)
        return content

    def _clean_soup(self, soup) -> None:
        """Clean the parsed document (implementation missing from the source).

        ``process_file`` ignores the return value and serializes the same
        ``soup`` object afterwards, so any cleaning has to mutate ``soup``.

        Raises:
            NotImplementedError: always, until the lost logic is restored.
                ``process_file`` catches it and reports the file as failed
                without writing anything, so no file is rewritten by a
                half-restored tool.
        """
        raise NotImplementedError(
            "HTMLProcessor._clean_soup is missing from the source file and must be restored"
        )

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file in place.

        Returns:
            True when the file was rewritten; False when any step raised
            (the error is logged and the file is left as it was, because the
            only write happens after every other step has succeeded).
        """
        try:
            content = self._filter_redundant_text(file_path)

            # Parse the HTML
            soup = BeautifulSoup(content, 'html.parser')

            # Clean the soup object
            self._clean_soup(soup)
            result = str(soup)

            # Write back to the file
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result)
        except Exception as e:
            # Per-file boundary: one bad file must not abort the whole batch.
            logger.error(f"处理失败 {file_path}: {str(e)}")
            return False

        logger.info(f"成功处理: {file_path}")
        return True

    def process_directory(self, directory: str) -> None:
        """Process all ``*.html`` files found recursively under *directory*.

        Logs an error and returns if *directory* is not a directory, logs a
        warning and returns if it contains no HTML files; otherwise runs
        ``process_file`` on each file in a thread pool of ``max_workers``
        threads and logs how many succeeded.
        """
        directory_path = Path(directory)

        if not directory_path.is_dir():
            logger.error(f"错误: 目录不存在 - {directory}")
            return

        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"未在目录中找到HTML文件: {directory}")
            return

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))

        success_count = sum(1 for result in results if result)
        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")


def main():
    """Entry point: process the directory named by argv[1], else the default."""
    if len(sys.argv) > 1:
        target_directory = sys.argv[1]
    else:
        target_directory = DEFAULT_TARGET_DIRECTORY

    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    main()