process_html_file_v2.py

from pathlib import Path
from bs4 import BeautifulSoup
import logging
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class HTMLProcessor:
    """HTML file processor."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers
  15. """ 过滤冗余文本 """
  16. def _filter_redundant_text(self, file_path: str) -> str:
  17. with open(file_path, 'r', encoding='utf-8') as f:
  18. content = f.read()
  19. content = content.replace('\n</p>', '</p>')
  20. content = content.replace('</body>', '\nb</body>')
  21. content = content.replace('<h2', '\n<h2')
  22. for text in content.split('\n'):
  23. if text.find('p34"') == 0 :
  24. content = content.replace(text, '<p class="'+text)
  25. if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
  26. content = content.replace(text, '')
  27. if text.strip() == '':
  28. content = content.replace(text, '')
  29. if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
  30. content = content.replace(text, '')
  31. content = content.replace('<p', '\n\n<p')
  32. # 写回文件
  33. with open(file_path, 'w', encoding='utf-8') as f:
  34. f.write(content)
  35. # content = content.replace('\n</p>', '</p>')
  36. # return content
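    # Illustration (hypothetical input, a minimal sketch of the filtering):
    #   'text\n</p>'        -> 'text</p>'         (paragraph closers re-joined)
    #   'p34">orphan line'  -> '<p class="p34">orphan line'  (truncated tag repaired)
    #   '<p class="x">...'  -> '\n\n<p class="x">...'        (paragraphs separated)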
    def _clean_soup(self, soup: BeautifulSoup) -> None:
        """Clean redundant content from the parsed BeautifulSoup tree.

        This method:
        1. Marks the document as Chinese via the xml:lang attribute.
        2. Replaces bare text nodes (and HTML comments) sitting directly
           under <body> with newlines.
        3. Keeps all text content that is wrapped in a tag.

        Args:
            soup: the parsed BeautifulSoup document.
        """
        soup.html.attrs['xml:lang'] = 'zh-CN'
        # Iterate over a snapshot of the children, since we mutate the tree.
        for element in list(soup.body.children):
            logger.debug("inspecting node: %r (name=%s)", element, element.name)
            if element.name is None:
                # Bare text node: replace it with a newline instead of
                # extract(), so the surrounding markup keeps its line breaks.
                element.replace_with('\n')
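    # E.g. for '<html><body>stray text<p>kept</p></body></html>' the bare
    # "stray text" node (element.name is None) becomes a newline, while the
    # <p> tag and its text are left untouched.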
    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file."""
        try:
            self._filter_redundant_text(file_path)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Parse the HTML
            soup = BeautifulSoup(content, 'html.parser')
            # Clean the parsed tree
            self._clean_soup(soup)
            result = str(soup)
            # Write the result back to the file
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result)
            logger.info(f"Processed successfully: {file_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            return False
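    # E.g. processor.process_file(Path("page.html")) returns True on success,
    # and logs the exception and returns False if reading, parsing, or
    # writing fails, so one bad file never aborts a batch run.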
    def process_directory(self, directory: str) -> None:
        """Process every HTML file under a directory (recursively)."""
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"Error: directory does not exist - {directory}")
            return
        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"No HTML files found in directory: {directory}")
            return
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for result in results if result)
        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
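

# Usage sketch (the path is hypothetical; adjust to your environment):
#
#     processor = HTMLProcessor(max_workers=8)
#     processor.process_directory("/path/to/exported/html")
#
# Files are cleaned in place and concurrently: process_directory walks the
# tree via Path.rglob, fans the files out to the thread pool, and logs an
# aggregate success count when the pool drains.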


def main():
    """Entry point."""
    # To take the directory from the command line instead, uncomment:
    # import sys
    # if len(sys.argv) > 1:
    #     target_directory = sys.argv[1]
    # else:
    #     target_directory = input("Enter the directory to process: ")
    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    main()