Author: max · committed 1 month ago · parent commit c524891fe7

BIN
.DS_Store


+ 122 - 0
code/process_html_file_v2.py

@@ -0,0 +1,122 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\n</body>')
+        content = content.replace('<h2', '\n<h2')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.strip() == '' or text.find('<') != 0:
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        content = content.replace('\n\n', '\n')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """清理BeautifulSoup对象中的冗余内容
+        
+        该方法会清理以下内容:
+        1. 移除所有HTML注释
+        2. 移除直接位于body或html标签下的纯文本节点
+        3. 保留所有被标签包裹的文本内容
+        
+        Args:
+            soup: BeautifulSoup对象
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            print("\n\n<<<===========================================\n")
+            print(element, element.name);
+            if element.name == None:
+                # element.extract() 原来的地方使用 换行符号
+                element.replace_with('\n')
+            print("===========================================>>>\n\n")
+      
+    def process_file(self, file_path: Path) -> bool:
+        """处理单个HTML文件"""
+        try:
+            self._filter_redundant_text(file_path);
+            # with open(file_path, 'r', encoding='utf-8') as f:
+            #     content = f.read()
+            
+            # # 解析HTML
+            # soup = BeautifulSoup(content, 'html.parser')
+            
+            # # 清理soup对象
+            # self._clean_soup(soup)
+            # result = str(soup)
+            
+            # # 写回文件
+            # with open(file_path, 'w', encoding='utf-8') as f:
+            #     f.write(result)
+            
+            logger.info(f"成功处理: {file_path}")
+            return True
+            
+        except Exception as e:
+            logger.error(f"处理失败 {file_path}: {str(e)}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """处理目录中的所有HTML文件"""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"错误: 目录不存在 - {directory}")
+            return
+        
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"未在目录中找到HTML文件: {directory}")
+            return
+        
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+        
+        logger.info(f"处理完成! 成功处理 {success_count}/{len(html_files)} 个文件")
+
def main():
    """Entry point: clean every HTML file under the target directory.

    The directory may be given as the first command-line argument;
    otherwise the historical hard-coded path is used as the default,
    so existing invocations keep working.
    """
    import sys
    if len(sys.argv) > 1:
        target_directory = sys.argv[1]
    else:
        target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    processor = HTMLProcessor()
    processor.process_directory(target_directory)

if __name__ == "__main__":
    main()

+ 1 - 1
code/translate_epub_v4(单线程版本)V3.py

@@ -563,7 +563,7 @@ def process_html_file(file_path, conn):
 
 
 def main():
     ops_dir = config.get('paths', 'input_dir')
-    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
+    html_files = [f for f in os.listdir(ops_dir) if f.endswith('.htm') or f.endswith('.html')]
     
     # 按文件名排序
     html_files.sort()

+ 0 - 0
code/translate_epub_v1.py → code/归档/translate_epub_v1.py


+ 0 - 0
code/translate_epub_v2.py → code/归档/translate_epub_v2.py


+ 0 - 0
code/translate_epub_v3.py → code/归档/translate_epub_v3.py


+ 0 - 0
code/translate_epub_v4(单线程版本).py → code/归档/translate_epub_v4(单线程版本).py


+ 0 - 0
code/translate_epub_v4(单线程版本)V2.py → code/归档/translate_epub_v4(单线程版本)V2.py


+ 0 - 0
code/translate_epub_v5(多线程版本).py → code/归档/translate_epub_v5(多线程版本).py