"""Walk a directory tree and run every ``*.html`` file through ``HTMLProcessor``."""

import ast
import logging
import sys
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

from bs4 import BeautifulSoup
from lxml import etree

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Directory processed when main() is not given an explicit target.
DEFAULT_TARGET_DIRECTORY = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"


class HTMLProcessor:
    """Processor for HTML files."""

    def __init__(self, max_workers: int = 4):
        """Store the thread-pool size used by ``process_directory``."""
        self.max_workers = max_workers

    def _filter_redundant_text(self, file_path: str) -> str:
        """Filter redundant text out of the file's contents.

        NOTE(review): this method arrived damaged. Its search/replace
        literals appear to have lost part of their text (presumably markup
        inside the strings was stripped), and whatever followed the last
        replacement is missing. Only the part that could be read
        unambiguously is live code; the rest is preserved verbatim in
        comments and must be restored from the original file before this
        helper is relied on. Nothing in this file currently calls it.

        Args:
            file_path: Path of the UTF-8 text file to read.

        Returns:
            The file's text after the replacements below.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Best-effort reading of the first replacement: three consecutive
        # newlines collapse to two. TODO: confirm against the original.
        content = content.replace('\n\n\n', '\n\n')

        # Unrecoverable as received -- kept for reference, NOT executed.
        # As written, the first would insert "\nb" between every character
        # and the second is not valid Python:
        #   content = content.replace('', '\nb')
        #   content = content.replace('>>\n\n")

        # Assumed from the ``-> str`` annotation; the original tail is lost.
        return content

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file.

        Args:
            file_path: HTML file to process.

        Returns:
            True when processing finished without an exception, False
            otherwise. Failures are logged instead of raised so one bad
            file does not abort a batch run.
        """
        try:
            self.ast_html(file_path)
            # The BeautifulSoup clean-up pass below is currently disabled:
            # self._filter_redundant_text(file_path)
            # with open(file_path, 'r', encoding='utf-8') as f:
            #     content = f.read()
            # soup = BeautifulSoup(content, 'html.parser')  # parse the HTML
            # self._clean_soup(soup)  # clean the soup object
            # result = str(soup)
            # with open(file_path, 'w', encoding='utf-8') as f:  # write back
            #     f.write(result)
        except Exception as e:
            logger.error(f"处理失败 {file_path}: {str(e)}")
            return False
        else:
            logger.info(f"成功处理: {file_path}")
            return True

    def ast_html(self, file_path: Path) -> None:
        """Parse the file with lxml and print every element under ``<body>``.

        Args:
            file_path: UTF-8 encoded HTML file to parse.

        Raises:
            ValueError: If lxml produces no document tree, or the tree has
                no ``<body>`` element.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Hand lxml bytes so the encoding is stated explicitly.
        html_bytes = content.encode('utf-8')
        tree = etree.HTML(html_bytes)
        # etree.HTML may yield None for input with no parseable content;
        # fail with a clear message instead of an AttributeError below.
        if tree is None:
            raise ValueError("无法解析HTML文档")

        bodies = tree.xpath('//body')
        if not bodies:
            raise ValueError("文档中未找到 <body> 元素")
        body = bodies[0]

        print(body)
        for element in body.iter():
            print(element.tag, element.text)

    def process_directory(self, directory: str) -> None:
        """Process every ``*.html`` file found recursively under ``directory``.

        Files are handed to ``process_file`` on a thread pool of
        ``self.max_workers`` workers, and a success summary is logged.

        Args:
            directory: Root directory to search.
        """
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"错误: 目录不存在 - {directory}")
            return

        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"未在目录中找到HTML文件: {directory}")
            return

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))

        success_count = sum(1 for succeeded in results if succeeded)
        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")


def main(target_directory: str = DEFAULT_TARGET_DIRECTORY) -> None:
    """Process every HTML file under ``target_directory``.

    Args:
        target_directory: Directory to process; defaults to
            ``DEFAULT_TARGET_DIRECTORY``.
    """
    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    # An optional first command-line argument overrides the default directory.
    main(*sys.argv[1:2])