>>\n\n") def process_file(self, file_path: Path) -> bool: """处理单个HTML文件""" try: self._filter_redundant_text(file_path); with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # 解析HTML soup = BeautifulSoup(content, 'html.parser') # 清理soup对象 self._clean_soup(soup) result = str(soup) # 写回文件 with open(file_path, 'w', encoding='utf-8') as f: f.write(result) logger.info(f"成功处理: {file_path}") return True except Exception as e: logger.error(f"处理失败 {file_path}: {str(e)}") return False def process_directory(self, directory: str) -> None: """处理目录中的所有HTML文件""" directory_path = Path(directory) if not directory_path.is_dir(): logger.error(f"错误: 目录不存在 - {directory}") return html_files = list(directory_path.rglob("*.html")) if not html_files: logger.warning(f"未在目录中找到HTML文件: {directory}") return success_count = 0 with ThreadPoolExecutor(max_workers=self.max_workers) as executor: results = list(executor.map(self.process_file, html_files)) success_count = sum(1 for result in results if result) logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件") def main(): """主函数""" # import sys # if len(sys.argv) > 1: # target_directory = sys.argv[1] # else: # target_directory = input("请输入要处理的目录路径: ") target_directory="/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops" processor = HTMLProcessor() processor.process_directory(target_directory) if __name__ == "__main__": main()