@@ -0,0 +1,199 @@
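+"""Clean up HTML files in bulk: strip HTML comments and explanatory notes,
+normalize the html/head/body structure, and wrap stray top-level content."""
+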
+import re
+import sys
+import logging
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+
+from bs4 import BeautifulSoup, Doctype, Comment
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """Processor for cleaning and normalizing HTML files."""
+
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+
+    def _is_explanation_text(self, text: str) -> bool:
+        """Return True if the text is an explanatory note rather than content."""
+        text = text.strip()
+        # Full-width parentheses are matched literally; unescaped ASCII parens
+        # would form regex groups and make the pattern match everything.
+        explanation_patterns = [
+            r'^（说明:.*?）$',   # "说明:" note wrapped in full-width parentheses
+            r'^说明:.*?$',       # bare "说明:" note
+            r'^（注:.*?）$',     # "注:" note wrapped in full-width parentheses
+            r'^注:.*?$',         # bare "注:" note
+            r'^（.*?）$',        # any text fully enclosed in full-width parentheses
+            r'^[0-9]+\.\s.*?$',  # numbered notes such as "1. ..."
+            r'^.*?的同时,.*?$',  # notes of the "...的同时..." form
+        ]
+        return any(re.match(pattern, text, re.DOTALL) for pattern in explanation_patterns)
+
+    def _clean_content(self, content: str) -> str:
+        """Strip HTML comments, explanatory notes, and blank lines from raw HTML."""
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove explanatory notes (full-width parens matched literally)
+        content = re.sub(r'（说明:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'说明:.*?$', '', content, flags=re.MULTILINE)
+        content = re.sub(r'（注:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'注:.*?$', '', content, flags=re.MULTILINE)
+        # Collapse blank lines
+        content = re.sub(r'\n\s*\n', '\n', content).strip()
+        return content
+
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Remove comments, explanatory notes, and empty tags from the parsed tree."""
+        # Remove all HTML comments
+        for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
+            comment.extract()
+
+        # Remove explanatory text nodes
+        for text in soup.find_all(string=True):
+            if self._is_explanation_text(text):
+                text.extract()
+
+        # Remove empty tags, but keep void elements that are empty by definition
+        void_elements = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
+                         'input', 'link', 'meta', 'source', 'track', 'wbr'}
+        for tag in soup.find_all():
+            if tag.name not in void_elements and not tag.get_text(strip=True) and not tag.attrs:
+                tag.extract()
+
+    def _ensure_proper_structure(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Rebuild the document around a proper html/head/body skeleton."""
+        new_soup = BeautifulSoup('', 'html.parser')
+
+        # Preserve the DOCTYPE if one exists
+        doctype = None
+        for item in soup.contents:
+            if isinstance(item, Doctype):
+                doctype = item
+                break
+        if doctype:
+            new_soup.append(doctype)
+
+        # Create the <html> root
+        html_tag = soup.new_tag('html')
+        new_soup.append(html_tag)
+
+        # Reuse the existing <head>, or create one
+        head_tag = soup.find('head')
+        if not head_tag:
+            head_tag = soup.new_tag('head')
+        html_tag.append(head_tag)
+
+        # Reuse the existing <body>, or create one
+        body_tag = soup.find('body')
+        if not body_tag:
+            body_tag = soup.new_tag('body')
+        html_tag.append(body_tag)
+
+        # Move remaining top-level tags into the body, including any content
+        # left inside an original <html> wrapper after head/body were moved
+        leftovers = soup.find_all(recursive=False)
+        old_html = soup.find('html')
+        if old_html:
+            leftovers.extend(old_html.find_all(recursive=False))
+        for element in leftovers:
+            if element.name not in ['html', 'head', 'body']:
+                element.extract()
+                body_tag.append(element)
+
+        return new_soup
+
+    def _process_unwrapped_content(self, soup: BeautifulSoup) -> None:
+        """Move stray content directly under <html> into the <body>."""
+        body_tag = soup.body
+        if not body_tag:
+            if not soup.html:
+                # Without an <html> root there is nothing to rewrap;
+                # _ensure_proper_structure has already created one by now.
+                return
+            soup.html.append(soup.new_tag('body'))
+            body_tag = soup.body
+
+        for element in list(soup.html.contents):
+            if element.name in ('head', 'body'):
+                continue
+            if element.name:
+                # Move stray tags into the body
+                element.extract()
+                body_tag.append(element)
+            elif str(element).strip():
+                text = str(element).strip()
+                if not self._is_explanation_text(text):
+                    # Wrap bare text in a paragraph and move it into the body
+                    new_p = soup.new_tag('p')
+                    new_p.string = text
+                    element.replace_with(new_p)
+                    body_tag.append(new_p)
+                else:
+                    element.extract()
+
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file in place."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Clean the raw text
+            content = self._clean_content(content)
+
+            # Parse the HTML
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Clean the parsed tree
+            self._clean_soup(soup)
+
+            # Normalize the document structure
+            soup = self._ensure_proper_structure(soup)
+
+            # Wrap any stray content
+            self._process_unwrapped_content(soup)
+
+            # Serialize and clean once more
+            result = self._clean_content(str(soup))
+
+            # Write the result back
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(result)
+
+            logger.info(f"Processed successfully: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under the given directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Directory does not exist: {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+        success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done: processed {success_count}/{len(html_files)} files successfully")
+
+def main():
+    """Entry point: take the target directory from argv or prompt for it."""
+    if len(sys.argv) > 1:
+        target_directory = sys.argv[1]
+    else:
+        target_directory = input("Enter the directory path to process: ")
+
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
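+
+# Example invocation (script and directory names are illustrative):
+#   python html_processor.py ./exported_pages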