process_html_file_v3.py

from pathlib import Path
from bs4 import BeautifulSoup
import logging
from concurrent.futures import ThreadPoolExecutor
import ast  # only used by the commented-out literal_eval experiment below
from lxml import etree

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class HTMLProcessor:
    """HTML file processor."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers

    def _filter_redundant_text(self, file_path: str) -> None:
        """Filter redundant text from a translated HTML file, in place."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        content = content.replace('\n</p>', '</p>')
        content = content.replace('</body>', '\n</body>')
        content = content.replace('<h2', '\n<h2')
        for text in content.split('\n'):
            # Repair lines whose '<p class="' prefix was lost, leaving the
            # line to start with the bare class value.
            if text.find('p34"') == 0:
                content = content.replace(text, '<p class="' + text)
            # Drop translator boilerplate lines.
            if text.find('代码结构') != -1 and text.find('完整保留未作改动') != -1:
                content = content.replace(text, '')
        content = content.replace('<p', '\n\n<p')
        # Write back to the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
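
    # A minimal before/after sketch of the filter, assuming a translated page
    # whose '<p class="p34">' opening tags were split across lines (the inputs
    # below are illustrative, not taken from real project data):
    #
    #   'p34">正文</p>'  ->  '<p class="p34">正文</p>'
    #   a line containing both 代码结构 and 完整保留未作改动  ->  dropped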

    def _clean_soup(self, soup: BeautifulSoup) -> None:
        """Clean redundant content from a BeautifulSoup object.

        This method:
        1. Tags the document as Chinese (xml:lang="zh-CN").
        2. Replaces bare text nodes and comments sitting directly under
           <body> with a newline.
        3. Leaves all text wrapped in tags untouched.

        Args:
            soup: the BeautifulSoup object, modified in place.
        """
        soup.html.attrs['xml:lang'] = 'zh-CN'
        # Iterate over a copy: replace_with() mutates the tree, and mutating
        # while iterating over .children can skip sibling nodes.
        for element in list(soup.body.children):
            logger.debug("node: %r (tag=%s)", element, element.name)
            if element.name is None:
                # A bare string (or comment) has no tag name; swap it for a
                # newline instead of extracting it outright.
                element.replace_with('\n')
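
    # A quick illustration of _clean_soup on a toy document (hypothetical
    # input, not from the project's data):
    #
    #   soup = BeautifulSoup('<html><body>stray<p>kept</p></body></html>',
    #                        'html.parser')
    #   HTMLProcessor()._clean_soup(soup)
    #   str(soup)  # '<html xml:lang="zh-CN"><body>\n<p>kept</p></body></html>'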

    def process_file(self, file_path: Path) -> bool:
        """Process a single HTML file; return True on success."""
        try:
            self.ast_html(file_path)
            # Earlier BeautifulSoup-based pipeline, kept for reference:
            # self._filter_redundant_text(file_path)
            # with open(file_path, 'r', encoding='utf-8') as f:
            #     content = f.read()
            # # Parse the HTML
            # soup = BeautifulSoup(content, 'html.parser')
            # # Clean the soup object
            # self._clean_soup(soup)
            # result = str(soup)
            # # Write the result back to the file
            # with open(file_path, 'w', encoding='utf-8') as f:
            #     f.write(result)
            logger.info(f"Processed successfully: {file_path}")
            return True
        except Exception as e:
            logger.error(f"Processing failed for {file_path}: {e}")
            return False
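
    # Hypothetical single-file call (the path is illustrative):
    #   ok = HTMLProcessor().process_file(Path('sample.html'))
    #   # ok is True on success, False if any step raised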

    def ast_html(self, file_path: Path) -> None:
        """Parse the file with lxml and dump the element tree under <body>."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Convert to bytes with an explicit encoding before handing to lxml
        html_bytes = content.encode('utf-8')
        tree = etree.HTML(html_bytes)
        body = tree.xpath('//body')[0]
        print(body)
        for element in body.iter():
            print(element.tag, element.text)
        # print(ast.literal_eval(content))
        # soup = BeautifulSoup(content, 'html.parser')
        # return str(soup)
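
    # For a file like '<html><body><h2>Title</h2><p>Text</p></body></html>',
    # the loop above would print roughly:
    #   body None
    #   h2 Title
    #   p Text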

    def process_directory(self, directory: str) -> None:
        """Process every HTML file under a directory, recursively."""
        directory_path = Path(directory)
        if not directory_path.is_dir():
            logger.error(f"Error: directory does not exist - {directory}")
            return
        html_files = list(directory_path.rglob("*.html"))
        if not html_files:
            logger.warning(f"No HTML files found in directory: {directory}")
            return
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(self.process_file, html_files))
        success_count = sum(1 for result in results if result)
        logger.info(f"Done! Successfully processed {success_count}/{len(html_files)} files")
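
    # A sketch of an alternative submission pattern, in case per-file error
    # context is ever needed (hypothetical variant; as_completed would need
    # to be imported from concurrent.futures):
    #
    #   futures = {executor.submit(self.process_file, p): p for p in html_files}
    #   for fut in as_completed(futures):
    #       logger.info("finished %s -> %s", futures[fut], fut.result())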


def main():
    """Entry point."""
    # import sys
    # if len(sys.argv) > 1:
    #     target_directory = sys.argv[1]
    # else:
    #     target_directory = input("Enter the directory to process: ")
    target_directory = "/Users/sysadmin/Desktop/code/wb_projec/english-to-chinese/003/Ops"
    processor = HTMLProcessor()
    processor.process_directory(target_directory)


if __name__ == "__main__":
    main()
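
# Usage: adjust target_directory above to a real directory of .html files,
# then run:
#   python process_html_file_v3.py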