process_html_file_v3.py

from pathlib import Path
from bs4 import BeautifulSoup
import logging
from concurrent.futures import ThreadPoolExecutor
import ast  # only used by the commented-out literal_eval experiment below
from lxml import etree

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class HTMLProcessor:
    """HTML file processor."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers

    def _filter_redundant_text(self, file_path: str) -> None:
        """Filter redundant text from a translated HTML file, in place."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        content = content.replace('\n</p>', '</p>')
        content = content.replace('</body>', '\n</body>')
        content = content.replace('<h2', '\n<h2')
        for text in content.split('\n'):
            # Repair lines whose '<p class="' prefix was lost, leaving the
            # line to start with the bare class value.
            if text.find('p34"') == 0:
                content = content.replace(text, '<p class="' + text)
            # Drop translator boilerplate lines.
            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
                content = content.replace(text, '')
        content = content.replace('<p', '\n\n<p')
        # Write back to the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
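
    # A minimal before/after sketch of the filter, assuming a translated page
    # whose '<p class="p34">' opening tags were split across lines (the inputs
    # below are illustrative, not taken from real project data):
    #
    #   'p34">正文</p>'  ->  '<p class="p34">正文</p>'
    #   a line containing both 代码结构 and 完整保留未作改动  ->  dropped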

    def _clean_soup(self, soup: BeautifulSoup) -> None:
        """Clean redundant content from a BeautifulSoup object.

        This method:
        1. Tags the document as Chinese (xml:lang="zh-CN").
        2. Replaces bare text nodes and comments sitting directly under
           <body> with a newline.
        3. Leaves all text wrapped in tags untouched.

        Args:
            soup: the BeautifulSoup object, modified in place.
        """
        soup.html.attrs['xml:lang'] = 'zh-CN'
        # Iterate over a copy: replace_with() mutates the tree, and mutating
        # while iterating over .children can skip sibling nodes.
        for element in list(soup.body.children):
            logger.debug("node: %r (tag=%s)", element, element.name)
            if element.name is None:
                # A bare string (or comment) has no tag name; swap it for a
                # newline instead of extracting it outright.
                element.replace_with('\n')
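
    # A quick illustration of _clean_soup on a toy document (hypothetical
    # input, not from the project's data):
    #
    #   soup = BeautifulSoup('<html><body>stray<p>kept</p></body></html>',
    #                        'html.parser')
    #   HTMLProcessor()._clean_soup(soup)
    #   str(soup)  # '<html xml:lang="zh-CN"><body>\n<p>kept</p></body></html>'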

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file; return True on success."""
        try:
            self.ast_html(file_path)
            # Earlier BeautifulSoup-based pipeline, kept for reference:
            # self._filter_redundant_text(file_path)
            # with open(file_path, 'r', encoding='utf-8') as f:
            #     content = f.read()
            # # Parse the HTML
            # soup = BeautifulSoup(content, 'html.parser')
            # # Clean the soup object
            # self._clean_soup(soup)
            # result = str(soup)
            # # Write the result back to the file
            # with open(file_path, 'w', encoding='utf-8') as f:
            #     f.write(result)
            logger.info(f"Processed successfully: {file_path}")
            return True
        except Exception as e:
            logger.error(f"Processing failed for {file_path}: {e}")
            return False
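
    # Hypothetical single-file call (the path is illustrative):
    #   ok = HTMLProcessor().process_file(Path('sample.html'))
    #   # ok is True on success, False if any step raised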

    def ast_html(self, file_path: Path) -> None:
        """Parse the file with lxml and dump the element tree under <body>."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Convert to bytes with an explicit encoding before handing to lxml
        html_bytes = content.encode('utf-8')
        tree = etree.HTML(html_bytes)
        body = tree.xpath('//body')[0]
        print(body)
        for element in body.iter():
            print(element.tag, element.text)
        # print(ast.literal_eval(content))
        # soup = BeautifulSoup(content, 'html.parser')
        # return str(soup)
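
    # For a file like '<html><body><h2>Title</h2><p>Text</p></body></html>',
    # the loop above would print roughly:
    #   body None
    #   h2 Title
    #   p Text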

    def process_directory(self, directory: str) -> None:
        """Process every HTML file under a directory, recursively."""
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"Error: directory does not exist - {directory}")
            return
        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"No HTML files found in directory: {directory}")
            return
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for result in results if result)
        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
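
    # A sketch of an alternative submission pattern, in case per-file error
    # context is ever needed (hypothetical variant; as_completed would need
    # to be imported from concurrent.futures):
    #
    #   futures = {executor.submit(self.process_file, p): p for p in html_files}
    #   for fut in as_completed(futures):
    #       logger.info("finished %s -> %s", futures[fut], fut.result())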


def main():
    """Entry point."""
    # import sys
    # if len(sys.argv) > 1:
    #     target_directory = sys.argv[1]
    # else:
    #     target_directory = input("Enter the directory to process: ")
    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    main()
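
# Usage: adjust target_directory above to a real directory of .html files,
# then run:
#   python process_html_file_v3.py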