"""Strip whitespace-only text nodes from the HTML files in a directory."""

import os

from bs4 import BeautifulSoup

# Elements in which whitespace is significant (or, for script/style, is raw
# content rather than markup); whitespace-only text inside them is kept.
_WHITESPACE_SENSITIVE_TAGS = ['pre', 'textarea', 'script', 'style']


def clean_html_file(file_path):
    """Remove whitespace-only text nodes from one HTML file, in place.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``,
    stripped of empty/whitespace-only text nodes, and then overwritten with
    the serialized tree.

    Text nodes that have a ``<pre>``, ``<textarea>``, ``<script>`` or
    ``<style>`` ancestor are left untouched so that preformatted content
    (for example newlines between ``<span>`` elements of highlighted code)
    is not collapsed.

    Args:
        file_path: Path of the HTML file to rewrite.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(content, 'html.parser')

    # find_all() returns a materialized result list, so extracting nodes
    # from the tree while looping over it is safe.
    for node in soup.find_all(string=True):
        if node.strip():
            # Node carries real text; keep it.
            continue
        if node.find_parent(_WHITESPACE_SENSITIVE_TAGS) is not None:
            # Whitespace matters here (any ancestor, not just the parent).
            continue
        node.extract()

    # Write the processed content back to the same file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))


def process_directory(directory):
    """Clean every regular ``*.html`` file directly inside ``directory``.

    The scan is not recursive. Entries whose name ends in ``.html`` but that
    are not regular files (for example a directory named ``foo.html``) are
    skipped instead of being passed to ``open()``.

    Args:
        directory: Path of the directory to scan.

    Raises:
        OSError: If the directory cannot be listed, or a file cannot be
            read or written.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.html'):
            continue
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        clean_html_file(file_path)


# Directory whose HTML files are cleaned when this file is run as a script.
directory_path = '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'

# Guarded so that importing this module does not rewrite files on disk.
if __name__ == '__main__':
    process_directory(directory_path)