# process_html_file.py

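"""Batch-clean HTML files in place.

Strips HTML comments, Chinese "说明/注" explanation text, and blank lines,
then normalizes each document to a proper DOCTYPE/html/head/body skeleton.
All HTML files under a directory are processed in parallel with a thread pool.
"""
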
import logging
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

from bs4 import BeautifulSoup, Comment, Doctype, Tag

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class HTMLProcessor:
    """Processor that cleans up HTML files."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers

    def _is_explanation_text(self, text: str) -> bool:
        """Return True if the text looks like explanatory/annotation text."""
        text = text.strip()
        explanation_patterns = [
            r'^(说明:.*?)$',
            r'^说明:.*?$',
            r'^(注:.*?)$',
            r'^注:.*?$',
            r'^(.*?)$',  # explanation text wrapped in full-width parentheses
            r'^[0-9]+\.\s.*?$',  # numbered explanation lines
            r'^.*?的同时,.*?$',  # "...的同时" ("while also ...") style explanations
        ]
        return any(re.match(pattern, text, re.DOTALL) for pattern in explanation_patterns)
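
    # A few illustrative inputs (these examples are not from the original
    # file and assume the full-width-parenthesis patterns above):
    #   "说明:这是说明文字"  -> True
    #   "(注:括号内说明)"    -> True
    #   "1. 编号说明"          -> True
    #   "正文内容"             -> False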

    def _clean_content(self, content: str) -> str:
        """Strip redundant blank lines, comments, and explanation text from raw HTML."""
        # Remove HTML comments
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
        # Remove explanation text
        content = re.sub(r'(说明:.*?)', '', content, flags=re.DOTALL)
        content = re.sub(r'说明:.*?$', '', content, flags=re.MULTILINE)
        content = re.sub(r'(注:.*?)', '', content, flags=re.DOTALL)
        content = re.sub(r'注:.*?$', '', content, flags=re.MULTILINE)
        # Collapse blank lines
        content = re.sub(r'\n\s*\n', '\n', content).strip()
        return content
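
    # For example (an illustrative input, not from the original file):
    #   "<p>正文</p>\n\n<!-- draft -->\n说明:仅供参考"
    # is reduced to "<p>正文</p>".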

    def _clean_soup(self, soup: BeautifulSoup) -> None:
        """Remove redundant content from a BeautifulSoup tree."""
        # Remove all comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Remove explanation text nodes
        for text in soup.find_all(string=True):
            if self._is_explanation_text(text):
                text.extract()
        # Remove empty tags, but keep void elements that are legitimately empty
        for tag in soup.find_all():
            if tag.name in ('br', 'hr', 'img', 'meta', 'link', 'input'):
                continue
            if not tag.get_text(strip=True) and not tag.attrs:
                tag.extract()
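
    # For example, '<div><!-- x --><p></p><p>正文</p></div>' (illustrative)
    # is reduced to '<div><p>正文</p></div>'.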

    def _ensure_proper_structure(self, soup: BeautifulSoup) -> BeautifulSoup:
        """Rebuild the document around a proper DOCTYPE/html/head/body skeleton."""
        new_soup = BeautifulSoup('', 'html.parser')
        # Preserve the original DOCTYPE, if any
        doctype = next((item for item in soup.contents if isinstance(item, Doctype)), None)
        if doctype:
            new_soup.append(doctype)
        # Create the <html> root
        html_tag = new_soup.new_tag('html')
        new_soup.append(html_tag)
        # Reuse the existing <head>, or create an empty one
        head_tag = soup.find('head')
        if head_tag is None:
            head_tag = new_soup.new_tag('head')
        html_tag.append(head_tag)
        # Reuse the existing <body>, or create an empty one
        body_tag = soup.find('body')
        if body_tag is None:
            body_tag = new_soup.new_tag('body')
        html_tag.append(body_tag)
        # Move remaining top-level tags into <body>, including anything left
        # inside the original <html> besides <head>/<body>
        leftovers = list(soup.find_all(recursive=False))
        old_html = soup.find('html')
        if old_html is not None:
            leftovers.extend(old_html.find_all(recursive=False))
        for element in leftovers:
            if element.name not in ('html', 'head', 'body'):
                element.extract()
                body_tag.append(element)
        return new_soup
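
    # For example, a bare fragment like "<p>hi</p>" (illustrative) is
    # rebuilt as: <html><head></head><body><p>hi</p></body></html>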

    def _process_unwrapped_content(self, soup: BeautifulSoup) -> None:
        """Move content sitting directly under <html> into <body>."""
        if soup.html is None:
            return
        body_tag = soup.body
        if body_tag is None:
            body_tag = soup.new_tag('body')
            soup.html.append(body_tag)
        for element in list(soup.html.contents):
            if isinstance(element, Tag):
                # Move stray tags (anything besides <head>/<body>) into <body>
                if element.name not in ('head', 'body'):
                    element.extract()
                    body_tag.append(element)
            else:
                # Loose text: wrap it in <p> unless it is explanation text
                text = str(element).strip()
                element.extract()
                if text and not self._is_explanation_text(text):
                    new_p = soup.new_tag('p')
                    new_p.string = text
                    body_tag.append(new_p)
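
    # For example, "<html><body></body>孤立文本</html>" (illustrative) ends up
    # with <p>孤立文本</p> appended inside <body>.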

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file in place. Returns True on success."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Clean the raw text
            content = self._clean_content(content)
            # Parse the HTML
            soup = BeautifulSoup(content, 'html.parser')
            # Clean the parsed tree
            self._clean_soup(soup)
            # Ensure a proper document skeleton
            soup = self._ensure_proper_structure(soup)
            # Move unwrapped content into <body>
            self._process_unwrapped_content(soup)
            # Serialize and clean once more
            result = self._clean_content(str(soup))
            # Write the result back to the same file
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result)
            logger.info(f"Processed successfully: {file_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            return False

    def process_directory(self, directory: str) -> None:
        """Process every HTML file under a directory, recursively."""
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"Error: directory does not exist - {directory}")
            return
        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"No HTML files found in directory: {directory}")
            return
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for result in results if result)
        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")


def main():
    """Entry point: take the target directory from argv, or prompt for it."""
    if len(sys.argv) > 1:
        target_directory = sys.argv[1]
    else:
        target_directory = input("Enter the directory path to process: ")
    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    main()
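
# Example usage (the directory path below is hypothetical):
#   python process_html_file.py ./html_docs
# Or from another module:
#   from process_html_file import HTMLProcessor
#   HTMLProcessor(max_workers=8).process_directory("./html_docs")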