123456789101112131415161718192021222324252627282930313233 |
- import os
- from bs4 import BeautifulSoup
def clean_html_file(file_path):
    """Remove whitespace-only text nodes from an HTML file, rewriting it in place.

    The file is read as UTF-8, parsed with the ``html.parser`` backend,
    stripped of text nodes that consist solely of whitespace, and written
    back to the same path as UTF-8.

    Text nodes that live anywhere inside ``<script>``, ``<style>``,
    ``<pre>`` or ``<textarea>`` are left untouched, because whitespace is
    significant there.

    Args:
        file_path: Path of the HTML file to clean. The file is overwritten.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(content, 'html.parser')

    # Elements whose whitespace must survive the clean-up.
    whitespace_sensitive = ['script', 'style', 'pre', 'textarea']

    # find_all() returns a list snapshot, so extracting nodes while
    # iterating over it is safe.
    # NOTE: a whitespace-only node sitting between two inline elements
    # (e.g. the space in "<b>a</b> <i>b</i>") is removed as well.
    for element in soup.find_all(string=True):
        if element.strip() != "":
            continue
        # Check every ancestor, not just the direct parent, so that
        # "<pre><code>...</code></pre>" is protected too.
        if element.find_parent(whitespace_sensitive) is not None:
            continue
        element.extract()

    # Write the processed document back to the same file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))
def process_directory(directory):
    """Clean every ``.html`` file located directly inside *directory*.

    Only entries whose name ends with ``.html`` are considered, and
    sub-directories are not descended into. Each matching regular file is
    passed to ``clean_html_file``, which rewrites it in place.

    Args:
        directory: Path of the directory to scan.

    Raises:
        OSError: If the directory cannot be listed.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.html'):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory may itself be named "*.html"; trying to open it
        # as a file would raise, so only regular files are processed.
        if os.path.isfile(file_path):
            clean_html_file(file_path)
# Default directory to process when none is given on the command line.
directory_path = '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'

# Only rewrite files when executed as a script, so importing this module
# has no side effects on disk.
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default path.
    if len(sys.argv) > 1:
        directory_path = sys.argv[1]
    process_directory(directory_path)
|