max, 1 month ago
parent
commit
67954afc6c

+ 15 - 0
003/.gitignore

@@ -0,0 +1,15 @@
+*.epub
+*.zip
+*.html
+translation_progress.db
+*.db
+*.log
+002/.DS_Store
+001/.DS_Store
+.DS_Store
+003/
+.idea/.gitignore
+.idea/english-to-chinese.iml
+.idea/modules.xml
+.idea/vcs.xml

+ 1 - 0
003/META-INF/container.xml

@@ -0,0 +1 @@
+<?xml version="1.0"?><container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"><rootfiles><rootfile full-path="Ops/content.opf" media-type="application/oebps-package+xml"/></rootfiles></container>

+ 33 - 0
003/clean_html_file.py

@@ -0,0 +1,33 @@
+import os
+from bs4 import BeautifulSoup
+
+def clean_html_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        content = file.read()
+
+    # Parse the HTML with BeautifulSoup
+    soup = BeautifulSoup(content, 'html.parser')
+
+    # Remove whitespace-only text nodes (blank lines); text wrapped in tags
+    # is untouched, and script/style bodies are deliberately skipped
+    for element in soup.find_all(string=True):
+        if element.strip() == "" and element.parent.name not in ['script', 'style']:
+            element.extract()
+
+    # Write the cleaned content back to the file
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.write(str(soup))
+
+def process_directory(directory):
+    for filename in os.listdir(directory):
+        if filename.endswith('.html'):
+            file_path = os.path.join(directory, filename)
+            clean_html_file(file_path)
+
+# Directory to process
+directory_path = '/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops'
+process_directory(directory_path)
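
A quick sanity check of what the whitespace-stripping pass does (the sample markup below is invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>保留</p>\n\n   \n<p>文本</p>', 'html.parser')
    # Drop whitespace-only text nodes, as clean_html_file does
    # (the script/style guard is omitted in this sketch)
    for node in soup.find_all(string=True):
        if node.strip() == '':
            node.extract()
    print(str(soup))  # -> <p>保留</p><p>文本</p>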

+ 1 - 0
003/mimetype

@@ -0,0 +1 @@
+application/epub+zip
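
For context: mimetype and META-INF/container.xml are the two fixed pieces of the EPUB OCF container. The zip must carry mimetype as its very first entry, stored uncompressed, and container.xml points readers at the package document (here Ops/content.opf). A minimal repacking sketch, assuming the 003/ layout above; the pack_epub helper and the output filename are illustrative, not part of this commit:

    import zipfile
    from pathlib import Path

    def pack_epub(src_dir: str, out_path: str) -> None:
        src = Path(src_dir)
        with zipfile.ZipFile(out_path, 'w') as zf:
            # OCF requires mimetype first and uncompressed (ZIP_STORED)
            zf.writestr('mimetype', 'application/epub+zip',
                        compress_type=zipfile.ZIP_STORED)
            for path in sorted(src.rglob('*')):
                if path.is_file() and path.name != 'mimetype':
                    zf.write(path, path.relative_to(src),
                             compress_type=zipfile.ZIP_DEFLATED)

    pack_epub('003', '003.epub')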

+ 199 - 0
003/process_html_file.py

@@ -0,0 +1,199 @@
+import os
+import re
+from typing import Optional, List
+from pathlib import Path
+from bs4 import BeautifulSoup, Doctype, Comment
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    def _is_explanation_text(self, text: str) -> bool:
+        """Return True if the text looks like translator commentary."""
+        text = text.strip()
+        explanation_patterns = [
+            r'^（说明:.*?）$',
+            r'^说明:.*?$',
+            r'^（注:.*?）$',
+            r'^注:.*?$',
+            r'^（.*?）$',  # commentary wrapped in fullwidth parentheses
+            r'^[0-9]+\.\s.*?$',  # numbered notes
+            r'^.*?的同时,.*?$',  # notes of the "...的同时..." form
+        ]
+        return any(re.match(pattern, text, re.DOTALL) for pattern in explanation_patterns)
+    
+    def _clean_content(self, content: str) -> str:
+        """Strip comments, commentary lines, and blank lines from raw HTML."""
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove translator commentary (fullwidth-parenthesised and line-level)
+        content = re.sub(r'（说明:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'说明:.*?$', '', content, flags=re.MULTILINE)
+        content = re.sub(r'（注:.*?）', '', content, flags=re.DOTALL)
+        content = re.sub(r'注:.*?$', '', content, flags=re.MULTILINE)
+        # Collapse blank lines
+        content = re.sub(r'\n\s*\n', '\n', content).strip()
+        return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Remove comments, commentary text nodes, and empty tags in place."""
+        # Remove all HTML comments
+        for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
+            comment.extract()
+
+        # Remove commentary text nodes
+        for text in soup.find_all(string=True):
+            if isinstance(text, str) and self._is_explanation_text(text):
+                text.extract()
+
+        # Drop tags with neither text content nor attributes
+        for tag in soup.find_all():
+            if not tag.get_text(strip=True) and not tag.attrs:
+                tag.extract()
+    
+    def _ensure_proper_structure(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Rebuild the document so it has a proper html/head/body skeleton."""
+        # Build a fresh soup to hold the normalised document
+        new_soup = BeautifulSoup('', 'html.parser')
+
+        # Carry over the DOCTYPE, if any
+        doctype = None
+        for item in soup.contents:
+            if isinstance(item, Doctype):
+                doctype = item
+                break
+        if doctype:
+            new_soup.append(doctype)
+
+        # Create the <html> element
+        html_tag = soup.new_tag('html')
+        new_soup.append(html_tag)
+
+        # Move or create <head>
+        head_tag = soup.find('head')
+        if head_tag:
+            html_tag.append(head_tag)
+        else:
+            head_tag = soup.new_tag('head')
+            html_tag.append(head_tag)
+
+        # Move or create <body>
+        body_tag = soup.find('body')
+        if body_tag:
+            html_tag.append(body_tag)
+        else:
+            body_tag = soup.new_tag('body')
+            html_tag.append(body_tag)
+
+        # Move any remaining top-level elements into <body>
+        for element in soup.find_all(recursive=False):
+            if element.name not in ['html', 'head', 'body']:
+                element.extract()
+                body_tag.append(element)
+
+        return new_soup
+    
+    def _process_unwrapped_content(self, soup: BeautifulSoup) -> None:
+        """Wrap stray top-level text in <p> tags and move it into <body>."""
+        body_tag = soup.body
+        if not body_tag:
+            # Defensive fallback; _ensure_proper_structure normally guarantees a body
+            if not soup.html:
+                soup = BeautifulSoup('<html><head></head><body></body></html>', 'html.parser')
+            else:
+                soup.html.append(soup.new_tag('body'))
+            body_tag = soup.body
+
+        for element in list(soup.html.contents):
+            if element.name != 'body' and element != body_tag:
+                if element.name:
+                    element.extract()
+                    body_tag.append(element)
+                elif str(element).strip():
+                    text = str(element).strip()
+                    if not self._is_explanation_text(text):
+                        new_p = soup.new_tag('p')
+                        new_p.string = text
+                        element.replace_with(new_p)
+                        body_tag.append(new_p)
+                    else:
+                        element.extract()
+    
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Clean the raw text
+            content = self._clean_content(content)
+
+            # Parse the HTML
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Clean the parsed tree
+            self._clean_soup(soup)
+
+            # Normalise the document structure
+            soup = self._ensure_proper_structure(soup)
+
+            # Wrap any stray top-level content
+            self._process_unwrapped_content(soup)
+
+            # Serialise and clean once more
+            result = self._clean_content(str(soup))
+
+            # Write the result back
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    import sys
+    if len(sys.argv) > 1:
+        target_directory = sys.argv[1]
+    else:
+        target_directory = input("Enter the directory to process: ")
+    
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
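
A quick check of _is_explanation_text against a few invented sample strings (ASCII colons, matching the patterns as written):

    p = HTMLProcessor()
    print(p._is_explanation_text('（说明:此段为译者添加）'))  # True: wrapped in fullwidth parentheses
    print(p._is_explanation_text('注:术语沿用原文'))          # True: line-level note
    print(p._is_explanation_text('正文内容不受影响。'))        # False: ordinary body text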

+ 118 - 0
003/process_html_file_v2.py

@@ -0,0 +1,118 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\nb</body>')
+        content = content.replace('<h2', '\n<h2')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            if text.strip() == '':
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+        content = content.replace('<p', '\n\n<p')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Clean redundant content from a BeautifulSoup tree.
+
+        This method:
+        1. Marks the document as zh-CN via the xml:lang attribute
+        2. Replaces bare text nodes directly under <body> with newlines
+        3. Leaves all tag-wrapped text untouched
+
+        Args:
+            soup: the BeautifulSoup object to clean in place
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            logger.debug("inspecting node: %r (%s)", element, element.name)
+            if element.name is None:
+                # Replace the bare text node with a newline instead of extracting it
+                element.replace_with('\n')
+      
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            self._filter_redundant_text(file_path)
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Parse the HTML
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Clean the parsed tree
+            self._clean_soup(soup)
+            result = str(soup)
+
+            # Write the result back
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    # import sys
+    # if len(sys.argv) > 1:
+    #     target_directory = sys.argv[1]
+    # else:
+    #     target_directory = input("Enter the directory to process: ")
+    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
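
The effect of v2's _clean_soup on stray text sitting directly under <body> (sample markup invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<html><body>游离文本<p>段落保留</p></body></html>', 'html.parser')
    soup.html.attrs['xml:lang'] = 'zh-CN'
    for element in list(soup.body.children):  # list() so replacement is safe mid-iteration
        if element.name is None:               # a bare NavigableString, not a tag
            element.replace_with('\n')
    print(str(soup))  # the bare text becomes a newline; the <p> content survives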

+ 137 - 0
003/process_html_file_v3.py

@@ -0,0 +1,137 @@
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from lxml import etree
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class HTMLProcessor:
+    """HTML文件处理器类"""
+    
+    def __init__(self, max_workers: int = 4):
+        self.max_workers = max_workers
+    
+    """ 过滤冗余文本 """
+    def _filter_redundant_text(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        content = content.replace('\n</p>', '</p>')
+        content = content.replace('</body>', '\nb</body>')
+        content = content.replace('<h2', '\n<h2')
+        for text in content.split('\n'):
+            if text.find('p34"') == 0 :
+                content = content.replace(text, '<p class="'+text)
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+            if text.strip() == '':
+                content = content.replace(text, '')
+            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
+                content = content.replace(text, '')
+        content = content.replace('<p', '\n\n<p')
+        # 写回文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # content = content.replace('\n</p>', '</p>')
+        # return content
+    
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Clean redundant content from a BeautifulSoup tree.
+
+        This method:
+        1. Marks the document as zh-CN via the xml:lang attribute
+        2. Replaces bare text nodes directly under <body> with newlines
+        3. Leaves all tag-wrapped text untouched
+
+        Args:
+            soup: the BeautifulSoup object to clean in place
+        """
+        soup.html.attrs['xml:lang'] = 'zh-CN'
+        for element in soup.body.children:
+            logger.debug("inspecting node: %r (%s)", element, element.name)
+            if element.name is None:
+                # Replace the bare text node with a newline instead of extracting it
+                element.replace_with('\n')
+      
+    def process_file(self, file_path: Path) -> bool:
+        """Process a single HTML file."""
+        try:
+            self.ast_html(file_path)
+            # Previous v2 pipeline, kept for reference:
+            # self._filter_redundant_text(file_path)
+            # with open(file_path, 'r', encoding='utf-8') as f:
+            #     content = f.read()
+
+            # # Parse the HTML
+            # soup = BeautifulSoup(content, 'html.parser')
+
+            # # Clean the parsed tree
+            # self._clean_soup(soup)
+            # result = str(soup)
+
+            # # Write the result back
+            # with open(file_path, 'w', encoding='utf-8') as f:
+            #     f.write(result)
+
+            logger.info(f"Processed: {file_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}")
+            return False
+    
+    def ast_html(self, file_path: Path) -> None:
+        """Parse the file with lxml and dump the body's element tree."""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Encode explicitly so lxml receives UTF-8 bytes
+        html_bytes = content.encode('utf-8')
+        tree = etree.HTML(html_bytes)
+        body = tree.xpath('//body')[0]
+        logger.debug("body element: %r", body)
+        for element in body.iter():
+            logger.debug("%s: %s", element.tag, element.text)
+    
+    def process_directory(self, directory: str) -> None:
+        """Process every HTML file under a directory."""
+        directory_path = Path(directory)
+        if not directory_path.is_dir():
+            logger.error(f"Error: directory does not exist - {directory}")
+            return
+
+        html_files = list(directory_path.rglob("*.html"))
+        if not html_files:
+            logger.warning(f"No HTML files found in directory: {directory}")
+            return
+
+        success_count = 0
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = list(executor.map(self.process_file, html_files))
+            success_count = sum(1 for result in results if result)
+
+        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
+
+def main():
+    """Entry point."""
+    # import sys
+    # if len(sys.argv) > 1:
+    #     target_directory = sys.argv[1]
+    # else:
+    #     target_directory = input("Enter the directory to process: ")
+    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
+    processor = HTMLProcessor()
+    processor.process_directory(target_directory)
+
+if __name__ == "__main__":
+    main()
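
v3 stops at inspecting the lxml tree. If the experiment were continued, a write-back step might look like the sketch below; the serialisation choices (method='html', pretty_print) are assumptions, not part of this commit:

    from lxml import etree

    def write_back(tree, file_path: str) -> None:
        # Serialise the parsed tree back to UTF-8 HTML
        html_bytes = etree.tostring(tree, encoding='utf-8',
                                    method='html', pretty_print=True)
        with open(file_path, 'wb') as f:
            f.write(html_bytes)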