root
/
english-to-chinese


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233
							import os
from bs4 import BeautifulSoup

def clean_html_file(file_path):
    """Remove whitespace-only text nodes from an HTML file, in place.

    The file is read as UTF-8 and parsed with BeautifulSoup's built-in
    ``html.parser``. Text nodes that contain nothing but whitespace (the
    blank lines and indentation between tags) are removed, and the
    serialized tree is written back to the same path as UTF-8.

    Whitespace-only nodes nested inside ``pre``, ``textarea``, ``script``
    or ``style`` are left alone: whitespace there is part of the content,
    not layout.

    Args:
        file_path: Path of the HTML file to clean. The file is overwritten.
    """
    # Tags whose whitespace must survive. A node is protected when any
    # ancestor is one of these (e.g. <pre><code>...</code></pre>), not just
    # its direct parent.
    protected_tags = ['pre', 'textarea', 'script', 'style']

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(content, 'html.parser')

    # `string=True` matches every text node (it replaces the older `text=`
    # spelling). find_all() hands back an already-built result list, so
    # detaching nodes inside the loop does not disturb the iteration.
    for element in soup.find_all(string=True):
        if element.strip():
            continue  # carries visible text; keep it
        if element.find_parent(protected_tags) is not None:
            continue  # whitespace is significant inside these tags
        element.extract()

    # Write the cleaned document back over the original file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))

def process_directory(directory):
    """Clean every ``.html`` file located directly inside ``directory``.

    Entries are matched by a case-sensitive ``.html`` suffix and handed to
    ``clean_html_file`` one by one. Subdirectories are not descended into.

    Args:
        directory: Path of the directory to scan.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the
    # processing order reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.html'):
            continue
        file_path = os.path.join(directory, filename)
        # A subdirectory or dangling symlink whose name ends in ".html"
        # cannot be opened as a file; skip it rather than abort the run.
        if not os.path.isfile(file_path):
            continue
        clean_html_file(file_path)

# Directory whose .html files are cleaned in place when this script runs.
# NOTE(review): this executes at import time and the path is hard-coded to
# one machine; consider a `__main__` guard and a command-line argument.
directory_path = (
    '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'
)
process_directory(directory_path)