1 month ago · c524891fe7
--- a/.DS_Store
+++ b/.DS_Store
--- a/code/process_html_file_v2.py
+++ b/code/process_html_file_v2.py
@@ -0,0 +1,122 @@
 
				+
			
 
				+from pathlib import Path
			
 
				+from bs4 import BeautifulSoup
			
 
				+import logging
			
 
				+from concurrent.futures import ThreadPoolExecutor
			
 
				# Logging setup: emit INFO and above as "<timestamp> - <level> - <message>".
				logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
				# Module-scoped logger shared by the code below.
				logger = logging.getLogger(__name__)
			
 
				+
			
 
				class HTMLProcessor:
				    """Clean up HTML files on disk, rewriting each file in place.

				    ``process_directory`` finds ``*.html`` files recursively and hands them
				    to ``process_file`` on a thread pool of ``max_workers`` threads.
				    """

				    # A line starting with this fragment is treated as a paragraph whose
				    # opening `<p class="` was lost; the opening is put back.
				    _BROKEN_P_PREFIX = 'p34"'
				    _P_OPEN = '<p class="'
				    # A line containing BOTH phrases is dropped. Presumably a leftover
				    # "code structure preserved unchanged" note from an upstream tool --
				    # TODO confirm with the producer of these files.
				    _NOTE_MARKERS = ('代码结构', '完整保留未作改动')

				    def __init__(self, max_workers: int = 4):
				        """Remember the thread-pool size used by ``process_directory``."""
				        self.max_workers = max_workers

				    @staticmethod
				    def _clean_text(content: str) -> str:
				        """Return ``content`` with redundant lines removed.

				        Steps:
				        1. Re-break the markup so that ``</p>`` joins the preceding line and
				           ``</body>`` / ``<h2`` each start a new line.
				        2. Repair lines that begin with ``p34"`` by prefixing ``<p class="``.
				        3. Drop every line that does not start with ``<`` in column 0 (this
				           covers blank lines, bare text, and indented tags), and every line
				           containing both note marker phrases.

				        Filtering is done per line. Replacing a rejected line's text across
				        the whole document would also delete that text wherever else it
				        occurs (a line holding one space would strip every space).

				        A trailing newline is kept when the normalised input had one.
				        """
				        content = content.replace('\n</p>', '</p>')
				        content = content.replace('</body>', '\n</body>')
				        content = content.replace('<h2', '\n<h2')

				        kept = []
				        for line in content.split('\n'):
				            if line.startswith(HTMLProcessor._BROKEN_P_PREFIX):
				                # Judge the repaired line below, not the broken original.
				                line = HTMLProcessor._P_OPEN + line
				            if not line.startswith('<'):
				                continue
				            if all(marker in line for marker in HTMLProcessor._NOTE_MARKERS):
				                continue
				            kept.append(line)

				        cleaned = '\n'.join(kept)
				        if kept and content.endswith('\n'):
				            cleaned += '\n'
				        return cleaned

				    def _filter_redundant_text(self, file_path: str) -> None:
				        """Filter redundant text out of one file and write the result back.

				        Args:
				            file_path: File to rewrite; read and written as UTF-8. Anything
				                ``open`` accepts works (``process_file`` passes a ``Path``).

				        Raises:
				            OSError: If the file cannot be read or written.
				            UnicodeDecodeError: If the file is not valid UTF-8.
				        """
				        with open(file_path, 'r', encoding='utf-8') as f:
				            content = f.read()

				        cleaned = self._clean_text(content)

				        with open(file_path, 'w', encoding='utf-8') as f:
				            f.write(cleaned)

				    def _clean_soup(self, soup: "BeautifulSoup") -> None:
				        """Tag the document as zh-CN and blank out bare nodes under ``<body>``.

				        Sets ``xml:lang="zh-CN"`` on ``<html>``, then replaces every direct
				        child of ``<body>`` whose ``name`` is ``None`` with a single newline.
				        Such nodes are presumably bare text and comments, which bs4 models as
				        strings -- confirm against the bs4 version in use. Text wrapped in a
				        tag is left untouched, and nodes deeper than ``<body>``'s direct
				        children are not visited.

				        A missing ``<html>`` or ``<body>`` is skipped rather than raising.

				        Args:
				            soup: Parsed document; modified in place.
				        """
				        if soup.html is not None:
				            soup.html.attrs['xml:lang'] = 'zh-CN'
				        if soup.body is None:
				            return

				        # Snapshot the children: replace_with() edits the list being walked.
				        for element in list(soup.body.children):
				            if element.name is None:
				                logger.debug("Replacing bare <body> child with newline: %r", element)
				                element.replace_with('\n')

				    def process_file(self, file_path: Path) -> bool:
				        """Clean a single HTML file in place.

				        Any exception is logged and reported as ``False`` instead of being
				        propagated, so one bad file does not abort a directory run.

				        Returns:
				            ``True`` if the file was processed, ``False`` on any error.
				        """
				        try:
				            self._filter_redundant_text(file_path)
				        except Exception as e:
				            logger.error(f"处理失败 {file_path}: {str(e)}")
				            return False

				        logger.info(f"成功处理: {file_path}")
				        return True

				    def process_directory(self, directory: str) -> None:
				        """Process every ``*.html`` file under ``directory``, recursively.

				        Logs an error and returns if ``directory`` is not a directory; logs a
				        warning and returns if no HTML files are found. Otherwise runs
				        ``process_file`` on each file using ``max_workers`` threads and logs
				        how many succeeded.
				        """
				        directory_path = Path(directory)
				        if not directory_path.is_dir():
				            logger.error(f"错误: 目录不存在 - {directory}")
				            return

				        html_files = list(directory_path.rglob("*.html"))
				        if not html_files:
				            logger.warning(f"未在目录中找到HTML文件: {directory}")
				            return

				        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
				            results = list(executor.map(self.process_file, html_files))

				        # process_file returns bools, so the sum is the number of successes.
				        success_count = sum(results)
				        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
			
 
				+
			
 
				def main():
				    """Entry point: clean every HTML file under the target directory.

				    The directory is the first command-line argument when one is given;
				    otherwise the previous hard-coded default is used, so running the script
				    with no arguments behaves as before.
				    """
				    import sys

				    default_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
				    target_directory = sys.argv[1] if len(sys.argv) > 1 else default_directory

				    processor = HTMLProcessor()
				    processor.process_directory(target_directory)
			
 
				+
			
 
				# Run main() only when this file is executed as a script, not when imported.
				if __name__ == "__main__":
				    main()
			
--- a/code/translate_epub_v4(单线程版本)V3.py
+++ b/code/translate_epub_v4(单线程版本)V3.py
@@ -563,7 +563,7 @@ def process_html_file(file_path, conn):
 
				 
			
 
				 def main():
			
 
				     ops_dir = config.get('paths', 'input_dir')
			
 
				-    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
			
 
				+    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.htm') or f.endswith('.html')]
			
 
				     
			
 
				     # 按文件名排序
			
 
				     html_files.sort()
			
--- a/code/归档/translate_epub_v1.py
+++ b/code/归档/translate_epub_v1.py
--- a/code/归档/translate_epub_v2.py
+++ b/code/归档/translate_epub_v2.py
--- a/code/归档/translate_epub_v3.py
+++ b/code/归档/translate_epub_v3.py
--- a/code/归档/translate_epub_v4(单线程版本).py
+++ b/code/归档/translate_epub_v4(单线程版本).py
--- a/code/归档/translate_epub_v4(单线程版本)V2.py
+++ b/code/归档/translate_epub_v4(单线程版本)V2.py
--- a/code/归档/translate_epub_v5(多线程版本).py
+++ b/code/归档/translate_epub_v5(多线程版本).py