Author: max · committed 1 month ago · parent commit c524891fe7

BIN
.DS_Store


+ 122 - 0
code/process_html_file_v2.py

@@ -0,0 +1,122 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\n</body>')
+        content = content.replace('<h2', '\n<h2')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.strip() == '' or text.find('<') != 0:
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """清理BeautifulSoup对象中的冗余内容
+        
+        该方法会清理以下内容:
+        1. 移除所有HTML注释
+        2. 移除直接位于body或html标签下的纯文本节点
+        3. 保留所有被标签包裹的文本内容
+        
+        Args:
+            soup: BeautifulSoup对象
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            print("\n\n<<<===========================================\n")
+            print(element, element.name);
+            if element.name == None:
+                # element.extract() 原来的地方使用 换行符号
+                element.replace_with('\n')
+            print("===========================================>>>\n\n")
+      
+    def process_file(self, file_path: Path) -> bool:
+        """处理单个HTML文件"""
+        try:
+            self._filter_redundant_text(file_path);
+            # with open(file_path, 'r', encoding='utf-8') as f:
+            #     content = f.read()
+            
+            # # 解析HTML
+            # soup = BeautifulSoup(content, 'html.parser')
+            
+            # # 清理soup对象
+            # self._clean_soup(soup)
+            # result = str(soup)
+            
+            # # 写回文件
+            # with open(file_path, 'w', encoding='utf-8') as f:
+            #     f.write(result)
+            
+            logger.info(f"成功处理: {file_path}")
+            return True
+            
+        except Exception as e:
+            logger.error(f"处理失败 {file_path}: {str(e)}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """处理目录中的所有HTML文件"""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"错误: 目录不存在 - {directory}")
+            return
+        
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"未在目录中找到HTML文件: {directory}")
+            return
+        
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+        
+        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
+
def main():
    """Entry point: clean every HTML file under the target directory.

    The directory may be given as the first command-line argument;
    otherwise the historical hard-coded path is used as the default,
    so existing invocations keep working.
    """
    import sys
    if len(sys.argv) > 1:
        target_directory = sys.argv[1]
    else:
        target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    processor = HTMLProcessor()
    processor.process_directory(target_directory)

if __name__ == "__main__":
    main()

+ 1 - 1
code/translate_epub_v4(单线程版本)V3.py

@@ -563,7 +563,7 @@ def process_html_file(file_path, conn):
 
 
 def main():
     ops_dir = config.get('paths', 'input_dir')
-    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
+    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.htm') or f.endswith('.html')]
     
     # 按文件名排序
     html_files.sort()

+ 0 - 0
code/translate_epub_v1.py → code/归档/translate_epub_v1.py


+ 0 - 0
code/translate_epub_v2.py → code/归档/translate_epub_v2.py


+ 0 - 0
code/translate_epub_v3.py → code/归档/translate_epub_v3.py


+ 0 - 0
code/translate_epub_v4(单线程版本).py → code/归档/translate_epub_v4(单线程版本).py


+ 0 - 0
code/translate_epub_v4(单线程版本)V2.py → code/归档/translate_epub_v4(单线程版本)V2.py


+ 0 - 0
code/translate_epub_v5(多线程版本).py → code/归档/translate_epub_v5(多线程版本).py