- from pathlib import Path
- from bs4 import BeautifulSoup
- import logging
- from concurrent.futures import ThreadPoolExecutor
# Logging setup: timestamped, level-tagged messages at INFO and above.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class HTMLProcessor:
    """Cleans up exported/translated HTML files in place.

    The processor pre-cleans the raw text (paragraph markup, translator
    boilerplate), then parses with BeautifulSoup to strip bare text
    nodes directly under <body>. Files are rewritten in place; a
    directory is processed concurrently with a thread pool.
    """

    def __init__(self, max_workers: int = 4):
        # Thread count used by process_directory().
        self.max_workers = max_workers

    def _filter_redundant_text(self, file_path: str) -> None:
        """Pre-clean the raw HTML text of *file_path* and write it back.

        Fixes common artifacts of the upstream export:
        - joins a newline that precedes a closing </p> into the tag
        - puts </body> and <h2> tags on their own lines
        - repairs paragraph openings whose '<p class="' prefix was
          truncated (lines starting with 'p34"')
        - drops translator boilerplate lines and blank lines
        - separates every <p ...> with a blank line

        Args:
            file_path: path of the HTML file to rewrite in place.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        content = content.replace('\n</p>', '</p>')
        # BUG FIX: the original wrote '\nb</body>', injecting a stray
        # literal 'b' into the output; only the newline was intended.
        content = content.replace('</body>', '\n</body>')
        content = content.replace('<h2', '\n<h2')

        # BUG FIX: the original called content.replace(line, ...) while
        # looping over the lines, which also clobbered identical text
        # anywhere else in the file (and whitespace runs globally).
        # Rebuilding the line list touches only the line itself.
        kept_lines = []
        for line in content.split('\n'):
            # Repair truncated '<p class="p34">...' openings.
            if line.startswith('p34"'):
                line = '<p class="' + line
            # Drop translator boilerplate (checked once; the original
            # duplicated this condition).
            if '代码结构' in line and '完整保留未作改动' in line:
                continue
            # Drop blank lines.
            if line.strip() == '':
                continue
            kept_lines.append(line)
        content = '\n'.join(kept_lines)

        # Give every paragraph a blank line of separation.
        content = content.replace('<p', '\n\n<p')

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def _clean_soup(self, soup: "BeautifulSoup") -> None:
        """Clean redundant content from a parsed document, in place.

        - marks the <html> element as zh-CN via xml:lang
        - replaces bare text nodes sitting directly under <body> with a
          newline; text wrapped in tags is preserved

        Args:
            soup: parsed BeautifulSoup document (mutated in place).
        """
        soup.html.attrs['xml:lang'] = 'zh-CN'
        for element in soup.body.children:
            # NavigableString nodes have no tag name: they are stray
            # text directly under <body>, so collapse them to a newline
            # (replace_with keeps the surrounding node order intact).
            if element.name is None:
                element.replace_with('\n')

    def process_file(self, file_path: Path) -> bool:
        """Clean a single HTML file in place.

        Returns:
            True on success, False if any step raised (the error is
            logged rather than propagated, so a bad file does not abort
            a directory run).
        """
        try:
            # Text-level cleanup first, then a parse-level pass.
            self._filter_redundant_text(file_path)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            soup = BeautifulSoup(content, 'html.parser')
            self._clean_soup(soup)
            result = str(soup)

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result)

            logger.info(f"成功处理: {file_path}")
            return True

        except Exception as e:
            logger.error(f"处理失败 {file_path}: {str(e)}")
            return False

    def process_directory(self, directory: str) -> None:
        """Clean every *.html file under *directory* (recursive).

        Missing directory or an empty match set is logged and returns
        early; files are processed concurrently with max_workers
        threads and a success tally is logged at the end.
        """
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"错误: 目录不存在 - {directory}")
            return

        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"未在目录中找到HTML文件: {directory}")
            return

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for result in results if result)

        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
def main(directory=None):
    """Entry point: clean every HTML file under a target directory.

    The target directory is resolved in order:
    1. the *directory* argument, if given (new, backward-compatible);
    2. the first command-line argument, if present (restores the
       argv handling that was left commented out);
    3. the historical hard-coded default path.

    Args:
        directory: optional path string; defaults to None.
    """
    import sys

    if directory is None:
        if len(sys.argv) > 1:
            directory = sys.argv[1]
        else:
            # Legacy default kept for backward compatibility.
            directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"

    processor = HTMLProcessor()
    processor.process_directory(directory)


if __name__ == "__main__":
    main()