|
@@ -0,0 +1,122 @@
|
|
|
+
|
|
|
+from pathlib import Path
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import logging
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
# Logging setup: INFO threshold, records rendered as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
class HTMLProcessor:
    """Batch cleaner for HTML files.

    Each file is rewritten in place so that only lines beginning with an
    HTML tag remain. All ``*.html`` files below a directory are processed
    concurrently on a thread pool.
    """

    # Lines starting with this text get ``_P_OPEN`` prepended, which turns
    # them back into a complete ``<p class="p34"...`` opening tag.
    _BROKEN_P_PREFIX = 'p34"'
    _P_OPEN = '<p class="'
    # A line containing *all* of these markers is dropped.
    _NOTE_MARKERS = ('代码结构', '完整保留未作改动')

    def __init__(self, max_workers: int = 4):
        """Store the thread-pool size used by ``process_directory``.

        Args:
            max_workers: Maximum number of worker threads.
        """
        self.max_workers = max_workers

    @staticmethod
    def _filter_content(content: str) -> str:
        """Return ``content`` with redundant text lines removed.

        Steps, in order:
          1. Join a ``</p>`` that sits on its own line onto the previous
             line, and force ``</body>`` and ``<h2`` onto new lines.
          2. Prepend ``<p class="`` to every line starting with ``p34"``.
          3. Drop every line that does not start with ``<`` (this includes
             blank and whitespace-only lines).
          4. Drop every line containing all of ``_NOTE_MARKERS``.

        Decisions are made per line, so removing one line never alters the
        text of another line that merely contains the same characters.

        Args:
            content: Raw file text.

        Returns:
            The filtered text. A single trailing newline is kept when the
            input ended with one and at least one line survived.
        """
        content = content.replace('\n</p>', '</p>')
        content = content.replace('</body>', '\n</body>')
        content = content.replace('<h2', '\n<h2')

        kept = []
        for line in content.split('\n'):
            if line.startswith(HTMLProcessor._BROKEN_P_PREFIX):
                # Repair first, so the repaired line passes the tag check below.
                line = HTMLProcessor._P_OPEN + line
            if not line.startswith('<'):
                continue
            if all(marker in line for marker in HTMLProcessor._NOTE_MARKERS):
                continue
            kept.append(line)

        result = '\n'.join(kept)
        if kept and content.endswith('\n'):
            result += '\n'
        return result

    def _filter_redundant_text(self, file_path: str) -> str:
        """Filter redundant text out of one file, rewriting it in place.

        Args:
            file_path: Path of the UTF-8 encoded file (``str`` or
                ``pathlib.Path``).

        Returns:
            The filtered content that was written back to the file.

        Raises:
            OSError: If the file cannot be read or written.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        content = self._filter_content(content)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return content

    def _clean_soup(self, soup: "BeautifulSoup") -> None:
        """Normalize a parsed document in place.

        * Sets ``xml:lang="zh-CN"`` on the ``<html>`` element, if present.
        * Replaces every direct child of ``<body>`` whose ``name`` is
          ``None`` (i.e. a node that is not a tag) with a single newline.
          Text wrapped in tags is left untouched.

        Documents without ``<html>`` or ``<body>`` are tolerated; the
        corresponding step is skipped.

        Args:
            soup: Parsed document to modify.
        """
        if soup.html is not None:
            soup.html.attrs['xml:lang'] = 'zh-CN'
        if soup.body is None:
            return
        # Snapshot the children: replace_with() edits the list being walked.
        for element in list(soup.body.children):
            if element.name is None:
                logger.debug("replacing bare body node: %r", element)
                element.replace_with('\n')

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file.

        Only the line-based text filter is applied; ``_clean_soup`` is not
        invoked here.

        Args:
            file_path: File to rewrite in place.

        Returns:
            True on success, False if any exception occurred (the failure
            is logged together with its traceback).
        """
        try:
            self._filter_redundant_text(file_path)
        except Exception as e:
            # Per-file boundary: one bad file must not abort the whole batch.
            logger.exception("处理失败 %s: %s", file_path, e)
            return False
        logger.info(f"成功处理: {file_path}")
        return True

    def process_directory(self, directory: str) -> None:
        """Process every ``*.html`` file below ``directory`` (recursively).

        Logs an error and returns if ``directory`` is not a directory, logs
        a warning and returns if it contains no HTML files, and otherwise
        logs how many files were processed successfully.

        Args:
            directory: Root directory to scan.
        """
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"错误: 目录不存在 - {directory}")
            return

        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"未在目录中找到HTML文件: {directory}")
            return

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for ok in results if ok)

        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
|
|
|
+
|
|
|
def main(argv=None):
    """Script entry point: clean every HTML file below a target directory.

    The directory is taken from the first command-line argument. When no
    argument is given, the historical hard-coded default path is used.

    Args:
        argv: Optional argument list (without the program name). When
            ``None``, ``sys.argv[1:]`` is used.
    """
    import sys

    default_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    args = sys.argv[1:] if argv is None else list(argv)
    target_directory = args[0] if args else default_directory

    processor = HTMLProcessor()
    processor.process_directory(target_directory)


# Run the batch job only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|