# clean_html_file.py
import os

from bs4 import BeautifulSoup
  3. def clean_html_file(file_path):
  4. with open(file_path, 'r', encoding='utf-8') as file:
  5. content = file.read()
  6. # 使用 BeautifulSoup 解析 HTML 内容
  7. soup = BeautifulSoup(content, 'html.parser')
  8. # 移除所有空行
  9. for element in soup.find_all(text=True):
  10. if element.isspace(): # 移除纯空白的文本节点
  11. element.extract()
  12. # 为了确保保留被标签包裹的内容,我们只移除非标签包裹的文本
  13. for element in soup.find_all(text=True):
  14. if element.strip() == "" and element.parent.name not in ['script', 'style']:
  15. element.extract()
  16. # 将处理后的内容写回文件
  17. with open(file_path, 'w', encoding='utf-8') as file:
  18. file.write(str(soup))
  19. def process_directory(directory):
  20. for filename in os.listdir(directory):
  21. if filename.endswith('.html'):
  22. file_path = os.path.join(directory, filename)
  23. clean_html_file(file_path)
  24. # 指定要处理的目录路径
  25. directory_path = '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'
  26. process_directory(directory_path)