max, 1 month ago
parent
commit
67954afc6c

+ 15 - 0
003/.gitignore

@@ -0,0 +1,15 @@
+*.epub
+*.zip
+*.html
+translation_progress.db
+*.db
+*.log
+002/.DS_Store
+001/.DS_Store
+.DS_Store
+003/
+.idea/.gitignore
+.idea/english-to-chinese.iml
+.idea/modules.xml
+.idea/vcs.xml

+ 1 - 0
003/META-INF/container.xml

@@ -0,0 +1 @@
+<?xml version="1.0"?><container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"><rootfiles><rootfile full-path="Ops/content.opf" media-type="application/oebps-package+xml"/></rootfiles></container>

+ 33 - 0
003/clean_html_file.py

@@ -0,0 +1,33 @@
+import os
+from bs4 import BeautifulSoup
+
+def clean_html_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        content = file.read()
+
+    # Parse the HTML with BeautifulSoup
+    soup = BeautifulSoup(content, 'html.parser')
+
+    # Remove whitespace-only text nodes (blank lines); text wrapped in tags
+    # is untouched, and script/style bodies are deliberately skipped
+    for element in soup.find_all(string=True):
+        if element.strip() == "" and element.parent.name not in ['script', 'style']:
+            element.extract()
+
+    # Write the cleaned content back to the file
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.write(str(soup))
+
+def process_directory(directory):
+    for filename in os.listdir(directory):
+        if filename.endswith('.html'):
+            file_path = os.path.join(directory, filename)
+            clean_html_file(file_path)
+
+# Directory to process
+directory_path = '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'
+process_directory(directory_path)
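
A quick sanity check of what the whitespace-stripping pass does (the sample markup below is invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>保留</p>\n\n   \n<p>文本</p>', 'html.parser')
    # Drop whitespace-only text nodes, as clean_html_file does
    # (the script/style guard is omitted in this sketch)
    for node in soup.find_all(string=True):
        if node.strip() == '':
            node.extract()
    print(str(soup))  # -> <p>保留</p><p>文本</p>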

+ 1 - 0
003/mimetype

@@ -0,0 +1 @@
+application/epub+zip
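
For context: mimetype and META-INF/container.xml are the two fixed pieces of the EPUB OCF container. The zip must carry mimetype as its very first entry, stored uncompressed, and container.xml points readers at the package document (here Ops/content.opf). A minimal repacking sketch, assuming the 003/ layout above; the pack_epub helper and the output filename are illustrative, not part of this commit:

    import zipfile
    from pathlib import Path

    def pack_epub(src_dir: str, out_path: str) -> None:
        src = Path(src_dir)
        with zipfile.ZipFile(out_path, 'w') as zf:
            # OCF requires mimetype first and uncompressed (ZIP_STORED)
            zf.writestr('mimetype', 'application/epub+zip',
                        compress_type=zipfile.ZIP_STORED)
            for path in sorted(src.rglob('*')):
                if path.is_file() and path.name != 'mimetype':
                    zf.write(path, path.relative_to(src),
                             compress_type=zipfile.ZIP_DEFLATED)

    pack_epub('003', '003.epub')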

+ 199 - 0
003/process_html_file.py

@@ -0,0 +1,199 @@
+import os
+import re
+from typing import Optional, List
+from pathlib import Path
+from bs4 import BeautifulSoup, Doctype, Comment
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    def _is_explanation_text(self, text: str) -> bool:
+        """Return True if the text looks like translator commentary."""
+        text = text.strip()
+        explanation_patterns = [
+            r'^（说明:.*?）$',
+            r'^说明:.*?$',
+            r'^（注:.*?）$',
+            r'^注:.*?$',
+            r'^（.*?）$',  # commentary wrapped in fullwidth parentheses
+            r'^[0-9]+\.\s.*?$',  # numbered notes
+            r'^.*?的同时,.*?$',  # notes of the "...的同时..." form
+        ]
+        return any(re.match(pattern, text, re.DOTALL) for pattern in explanation_patterns)
+    
+    def _clean_content(self, content: str) -> str:
+        """Strip comments, commentary lines, and blank lines from raw HTML."""
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove translator commentary (fullwidth-parenthesised and line-level)
+        content = re.sub(r'（说明:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'说明:.*?$', '', content, flags=re.MULTILINE)
+        content = re.sub(r'（注:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'注:.*?$', '', content, flags=re.MULTILINE)
+        # Collapse blank lines
+        content = re.sub(r'\n\s*\n', '\n', content).strip()
+        return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Remove comments, commentary text nodes, and empty tags in place."""
+        # Remove all HTML comments
+        for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
+            comment.extract()
+
+        # Remove commentary text nodes
+        for text in soup.find_all(string=True):
+            if isinstance(text, str) and self._is_explanation_text(text):
+                text.extract()
+
+        # Drop tags with neither text content nor attributes
+        for tag in soup.find_all():
+            if not tag.get_text(strip=True) and not tag.attrs:
+                tag.extract()
+    
+    def _ensure_proper_structure(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Rebuild the document so it has a proper html/head/body skeleton."""
+        # Build a fresh soup to hold the normalised document
+        new_soup = BeautifulSoup('', 'html.parser')
+
+        # Carry over the DOCTYPE, if any
+        doctype = None
+        for item in soup.contents:
+            if isinstance(item, Doctype):
+                doctype = item
+                break
+        if doctype:
+            new_soup.append(doctype)
+
+        # Create the <html> element
+        html_tag = soup.new_tag('html')
+        new_soup.append(html_tag)
+
+        # Move or create <head>
+        head_tag = soup.find('head')
+        if head_tag:
+            html_tag.append(head_tag)
+        else:
+            head_tag = soup.new_tag('head')
+            html_tag.append(head_tag)
+
+        # Move or create <body>
+        body_tag = soup.find('body')
+        if body_tag:
+            html_tag.append(body_tag)
+        else:
+            body_tag = soup.new_tag('body')
+            html_tag.append(body_tag)
+
+        # Move any remaining top-level elements into <body>
+        for element in soup.find_all(recursive=False):
+            if element.name not in ['html', 'head', 'body']:
+                element.extract()
+                body_tag.append(element)
+
+        return new_soup
+    
+    def _process_unwrapped_content(self, soup: BeautifulSoup) -> None:
+        """Wrap stray top-level text in <p> tags and move it into <body>."""
+        body_tag = soup.body
+        if not body_tag:
+            # Defensive fallback; _ensure_proper_structure normally guarantees a body
+            if not soup.html:
+                soup = BeautifulSoup('<html><head></head><body></body></html>', 'html.parser')
+            else:
+                soup.html.append(soup.new_tag('body'))
+            body_tag = soup.body
+
+        for element in list(soup.html.contents):
+            if element.name != 'body' and element != body_tag:
+                if element.name:
+                    element.extract()
+                    body_tag.append(element)
+                elif str(element).strip():
+                    text = str(element).strip()
+                    if not self._is_explanation_text(text):
+                        new_p = soup.new_tag('p')
+                        new_p.string = text
+                        element.replace_with(new_p)
+                        body_tag.append(new_p)
+                    else:
+                        element.extract()
+    
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Clean the raw text
+            content = self._clean_content(content)
+
+            # Parse the HTML
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Clean the parsed tree
+            self._clean_soup(soup)
+
+            # Normalise the document structure
+            soup = self._ensure_proper_structure(soup)
+
+            # Wrap any stray top-level content
+            self._process_unwrapped_content(soup)
+
+            # Serialise and clean once more
+            result = self._clean_content(str(soup))
+
+            # Write the result back
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    import sys
+    if len(sys.argv) > 1:
+        target_directory = sys.argv[1]
+    else:
+        target_directory = input("Enter the directory to process: ")
+    
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
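
A quick check of _is_explanation_text against a few invented sample strings (ASCII colons, matching the patterns as written):

    p = HTMLProcessor()
    print(p._is_explanation_text('（说明:此段为译者添加）'))  # True: wrapped in fullwidth parentheses
    print(p._is_explanation_text('注:术语沿用原文'))          # True: line-level note
    print(p._is_explanation_text('正文内容不受影响。'))        # False: ordinary body text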

+ 118 - 0
003/process_html_file_v2.py

@@ -0,0 +1,118 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\nb</body>')
+        content = content.replace('<h2', '\n<h2')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            if text.strip() == '':
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+        content = content.replace('<p', '\n\n<p')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Clean redundant content from a BeautifulSoup tree.
+
+        This method:
+        1. Marks the document as zh-CN via the xml:lang attribute
+        2. Replaces bare text nodes directly under <body> with newlines
+        3. Leaves all tag-wrapped text untouched
+
+        Args:
+            soup: the BeautifulSoup object to clean in place
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            logger.debug("inspecting node: %r (%s)", element, element.name)
+            if element.name is None:
+                # Replace the bare text node with a newline instead of extracting it
+                element.replace_with('\n')
+      
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            self._filter_redundant_text(file_path)
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Parse the HTML
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Clean the parsed tree
+            self._clean_soup(soup)
+            result = str(soup)
+
+            # Write the result back
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    # import sys
+    # if len(sys.argv) > 1:
+    #     target_directory = sys.argv[1]
+    # else:
+    #     target_directory = input("Enter the directory to process: ")
+    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
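
The effect of v2's _clean_soup on stray text sitting directly under <body> (sample markup invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<html><body>游离文本<p>段落保留</p></body></html>', 'html.parser')
    soup.html.attrs['xml:lang'] = 'zh-CN'
    for element in list(soup.body.children):  # list() so replacement is safe mid-iteration
        if element.name is None:               # a bare NavigableString, not a tag
            element.replace_with('\n')
    print(str(soup))  # the bare text becomes a newline; the <p> content survives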

+ 137 - 0
003/process_html_file_v3.py

@@ -0,0 +1,137 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from lxml import etree
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\nb</body>')
+        content = content.replace('<h2', '\n<h2')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            if text.strip() == '':
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+        content = content.replace('<p', '\n\n<p')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Clean redundant content from a BeautifulSoup tree.
+
+        This method:
+        1. Marks the document as zh-CN via the xml:lang attribute
+        2. Replaces bare text nodes directly under <body> with newlines
+        3. Leaves all tag-wrapped text untouched
+
+        Args:
+            soup: the BeautifulSoup object to clean in place
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            logger.debug("inspecting node: %r (%s)", element, element.name)
+            if element.name is None:
+                # Replace the bare text node with a newline instead of extracting it
+                element.replace_with('\n')
+      
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            self.ast_html(file_path)
+            # Previous v2 pipeline, kept for reference:
+            # self._filter_redundant_text(file_path)
+            # with open(file_path, 'r', encoding='utf-8') as f:
+            #     content = f.read()
+
+            # # Parse the HTML
+            # soup = BeautifulSoup(content, 'html.parser')
+
+            # # Clean the parsed tree
+            # self._clean_soup(soup)
+            # result = str(soup)
+
+            # # Write the result back
+            # with open(file_path, 'w', encoding='utf-8') as f:
+            #     f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def ast_html(self, file_path: Path) -> None:
+        """Parse the file with lxml and dump the body's element tree."""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Encode explicitly so lxml receives UTF-8 bytes
+        html_bytes = content.encode('utf-8')
+        tree = etree.HTML(html_bytes)
+        body = tree.xpath('//body')[0]
+        logger.debug("body element: %r", body)
+        for element in body.iter():
+            logger.debug("%s: %s", element.tag, element.text)
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    # import sys
+    # if len(sys.argv) > 1:
+    #     target_directory = sys.argv[1]
+    # else:
+    #     target_directory = input("Enter the directory to process: ")
+    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
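
v3 stops at inspecting the lxml tree. If the experiment were continued, a write-back step might look like the sketch below; the serialisation choices (method='html', pretty_print) are assumptions, not part of this commit:

    from lxml import etree

    def write_back(tree, file_path: str) -> None:
        # Serialise the parsed tree back to UTF-8 HTML
        html_bytes = etree.tostring(tree, encoding='utf-8',
                                    method='html', pretty_print=True)
        with open(file_path, 'wb') as f:
            f.write(html_bytes)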