# process_html_file_v2.py
  1. from pathlib import Path
  2. from bs4 import BeautifulSoup
  3. import logging
  4. from concurrent.futures import ThreadPoolExecutor
  5. # 配置日志
  6. logging.basicConfig(
  7. level=logging.INFO,
  8. format='%(asctime)s - %(levelname)s - %(message)s'
  9. )
  10. logger = logging.getLogger(__name__)
  11. class HTMLProcessor:
  12. """HTML文件处理器类"""
  13. def __init__(self, max_workers: int = 4):
  14. self.max_workers = max_workers
  15. """ 过滤冗余文本 """
  16. def _filter_redundant_text(self, file_path: str) -> str:
  17. with open(file_path, 'r', encoding='utf-8') as f:
  18. content = f.read()
  19. content = content.replace('\n</p>', '</p>')
  20. content = content.replace('</body>', '\n</body>')
  21. content = content.replace('<h2', '\n<h2')
  22. content = content.replace('\n\n', '\n')
  23. content = content.replace('\n\n', '\n')
  24. content = content.replace('\n\n', '\n')
  25. for text in content.split('\n'):
  26. if text.find('p34"') == 0 :
  27. content = content.replace(text, '<p class="'+text)
  28. if text.strip() == '' or text.find('<') != 0:
  29. content = content.replace(text, '')
  30. if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
  31. content = content.replace(text, '')
  32. content = content.replace('\n\n', '\n')
  33. content = content.replace('\n\n', '\n')
  34. content = content.replace('\n\n', '\n')
  35. # 写回文件
  36. with open(file_path, 'w', encoding='utf-8') as f:
  37. f.write(content)
  38. # content = content.replace('\n</p>', '</p>')
  39. # return content
  40. def _clean_soup(self, soup: BeautifulSoup) -> None:
  41. """清理BeautifulSoup对象中的冗余内容
  42. 该方法会清理以下内容:
  43. 1. 移除所有HTML注释
  44. 2. 移除直接位于body或html标签下的纯文本节点
  45. 3. 保留所有被标签包裹的文本内容
  46. Args:
  47. soup: BeautifulSoup对象
  48. """
  49. soup.html.attrs['xml:lang'] = 'zh-CN'
  50. for element in soup.body.children:
  51. print("\n\n<<<===========================================\n")
  52. print(element, element.name);
  53. if element.name == None:
  54. # element.extract() 原来的地方使用 换行符号
  55. element.replace_with('\n')
  56. print("===========================================>>>\n\n")
  57. def process_file(self, file_path: Path) -> bool:
  58. """处理单个HTML文件"""
  59. try:
  60. self._filter_redundant_text(file_path);
  61. # with open(file_path, 'r', encoding='utf-8') as f:
  62. # content = f.read()
  63. # # 解析HTML
  64. # soup = BeautifulSoup(content, 'html.parser')
  65. # # 清理soup对象
  66. # self._clean_soup(soup)
  67. # result = str(soup)
  68. # # 写回文件
  69. # with open(file_path, 'w', encoding='utf-8') as f:
  70. # f.write(result)
  71. logger.info(f"成功处理: {file_path}")
  72. return True
  73. except Exception as e:
  74. logger.error(f"处理失败 {file_path}: {str(e)}")
  75. return False
  76. def process_directory(self, directory: str) -> None:
  77. """处理目录中的所有HTML文件"""
  78. directory_path = Path(directory)
  79. if not directory_path.is_dir():
  80. logger.error(f"错误: 目录不存在 - {directory}")
  81. return
  82. html_files = list(directory_path.rglob("*.html"))
  83. if not html_files:
  84. logger.warning(f"未在目录中找到HTML文件: {directory}")
  85. return
  86. success_count = 0
  87. with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
  88. results = list(executor.map(self.process_file, html_files))
  89. success_count = sum(1 for result in results if result)
  90. logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
  91. def main():
  92. """主函数"""
  93. # import sys
  94. # if len(sys.argv) > 1:
  95. # target_directory = sys.argv[1]
  96. # else:
  97. # target_directory = input("请输入要处理的目录路径: ")
  98. target_directory="/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
  99. processor = HTMLProcessor()
  100. processor.process_directory(target_directory)
  101. if __name__ == "__main__":
  102. main()