# Source: root / english-to-chinese
import asyncio
import logging
import os
import re
import sqlite3
import threading
import time
from collections import deque
from datetime import datetime
from logging.handlers import RotatingFileHandler

import openai
import yaml
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from tqdm import tqdm

# 配置管理
class Config:
    """Application configuration backed by a YAML file.

    Constructing an instance loads the YAML file (creating it with defaults
    when it does not exist), checks that every required section and field is
    present, installs a rotating-file handler and a console handler on the
    root logger, and builds the OpenAI client exposed as ``self.client``.
    """

    def __init__(self, config_path='config.yaml'):
        """Load, validate and apply the configuration stored at *config_path*."""
        self.config_path = config_path
        # Handlers installed by setup_logging(); tracked so a later call can
        # replace them instead of stacking duplicates on the root logger.
        self._log_handlers = []
        self.config = self.load_config()

        # Validate the configuration
        self.validate_config()

        # Set up logging
        self.setup_logging()

        # Initialise the OpenAI client
        self.setup_openai()

    def validate_config(self):
        """Check that all required sections and fields exist.

        Raises:
            ValueError: if the configuration is not a mapping, or a required
                section or field is missing.
        """
        required_fields = {
            'logging': ['level', 'format', 'file'],
            'openai': ['base_url', 'api_key', 'model_name', 'max_retries', 'retry_delay', 'timeout', 'max_concurrent_requests'],
            'translation': ['min_line_count', 'max_line_count', 'initial_line_count', 'error_threshold', 'success_threshold', 'error_cooldown', 'cache_size'],
            'database': ['path', 'pool_size'],
            'paths': ['input_dir', 'output_dir']
        }

        if not isinstance(self.config, dict):
            raise ValueError("配置文件内容必须是字典")

        for section, fields in required_fields.items():
            section_values = self.config.get(section)
            # A section that is present but empty/non-mapping (e.g. "openai:"
            # with no children) is treated the same as a missing section.
            if not isinstance(section_values, dict):
                raise ValueError(f"缺少配置节: {section}")
            for field in fields:
                if field not in section_values:
                    raise ValueError(f"缺少配置项: {section}.{field}")

    def load_config(self):
        """Return the configuration mapping, creating a default file if needed.

        When ``self.config_path`` does not exist a default configuration is
        written there and returned. Otherwise the file is parsed with
        ``yaml.safe_load``; an empty file yields an empty dict so that
        validation reports a missing section rather than failing on ``None``.
        """
        if not os.path.exists(self.config_path):
            # Build the default configuration
            default_config = {
                'logging': {
                    'level': 'INFO',
                    'format': '%(asctime)s - %(levelname)s - %(message)s',
                    'file': 'translation.log'
                },
                'openai': {
                    'base_url': 'https://api.siliconflow.cn/v1',
                    'api_key': 'sk-',
                    'model_name': 'deepseek-ai/DeepSeek-R1',
                    'max_retries': 3,
                    'retry_delay': 2,
                    'timeout': 30,
                    'max_concurrent_requests': 5
                },
                'translation': {
                    'min_line_count': 1,
                    'max_line_count': 5,
                    'initial_line_count': 2,
                    'error_threshold': 3,
                    'success_threshold': 5,
                    'error_cooldown': 60,
                    'cache_size': 1000
                },
                'database': {
                    'path': 'translation_progress.db',
                    'pool_size': 5
                },
                'paths': {
                    'input_dir': '002/Ops',
                    'output_dir': '002/Ops_translated'
                }
            }

            # Persist the default configuration
            with open(self.config_path, 'w', encoding='utf-8') as f:
                yaml.dump(default_config, f, allow_unicode=True)

            return default_config

        # Load the existing configuration
        with open(self.config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        return {} if loaded is None else loaded

    def setup_logging(self):
        """Install file and console handlers on the root logger.

        Safe to call repeatedly: handlers installed by a previous call on this
        instance are removed and closed first, so ``update()`` does not cause
        duplicated log lines or leaked file handles.
        """
        log_file = self.config['logging']['file']
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        root_logger = logging.getLogger()
        for stale_handler in getattr(self, '_log_handlers', []):
            root_logger.removeHandler(stale_handler)
            stale_handler.close()

        # Create the log handlers
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10*1024*1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )
        console_handler = logging.StreamHandler()

        # Apply the configured format
        formatter = logging.Formatter(self.config['logging']['format'])
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)

        # Configure the root logger; setLevel accepts level names, so a
        # lowercase name such as "info" is normalised rather than rejected.
        level = self.config['logging']['level']
        root_logger.setLevel(level.upper() if isinstance(level, str) else level)
        root_logger.addHandler(file_handler)
        root_logger.addHandler(console_handler)
        self._log_handlers = [file_handler, console_handler]

    def setup_openai(self):
        """Build the OpenAI client from the ``openai`` configuration section."""
        self.client = openai.OpenAI(
            base_url=self.config['openai']['base_url'],
            api_key=self.config['openai']['api_key']
        )

    def get(self, *keys):
        """Return the value found by following *keys* through nested mappings.

        Raises:
            KeyError: if any key along the path is missing.
        """
        value = self.config
        for key in keys:
            value = value[key]
        return value

    def update(self, updates):
        """Deep-merge *updates* into the configuration, save it and re-apply it.

        Nested dicts in *updates* are merged key by key; any other value
        replaces the existing one. The merged configuration is written back to
        ``self.config_path`` and logging and the OpenAI client are rebuilt.
        """
        def deep_update(d, u):
            for k, v in u.items():
                if isinstance(v, dict):
                    child = d.get(k)
                    # A scalar being replaced by a mapping starts from empty.
                    d[k] = deep_update(child if isinstance(child, dict) else {}, v)
                else:
                    d[k] = v
            return d

        self.config = deep_update(self.config, updates)

        # Persist the updated configuration
        with open(self.config_path, 'w', encoding='utf-8') as f:
            yaml.dump(self.config, f, allow_unicode=True)

        # Re-apply logging and the OpenAI client
        self.setup_logging()
        self.setup_openai()


# Create the module-wide configuration instance
config = Config()

# Request/model settings copied out of the loaded configuration
MODEL_CONFIG = {
    option: config.get('openai', option)
    for option in (
        "model_name",
        "max_retries",
        "retry_delay",
        "timeout",
        "max_concurrent_requests",
    )
}
MODEL_CONFIG["cache_size"] = config.get('translation', 'cache_size')

# Line-count bounds and adaptation thresholds from the translation section
(
    MIN_LINE_COUNT,
    MAX_LINE_COUNT,
    INITIAL_LINE_COUNT,
    ERROR_THRESHOLD,
    SUCCESS_THRESHOLD,
) = (
    config.get('translation', setting)
    for setting in (
        'min_line_count',
        'max_line_count',
        'initial_line_count',
        'error_threshold',
        'success_threshold',
    )
)

# 更新其他类的初始化参数
class TranslationStats:
    """Running counters for translated characters and request outcomes."""

    def __init__(self):
        """Start the clock and zero every counter."""
        self.start_time = time.time()
        self.total_chars = self.translated_chars = 0
        self.total_requests = 0
        self.successful_requests = self.failed_requests = 0

    def update_stats(self, original_text, translated_text, success=True):
        """Record one request: add both text lengths and bump the outcome counter."""
        source_length = len(original_text)
        self.total_chars += source_length
        target_length = len(translated_text)
        self.translated_chars += target_length
        self.total_requests += 1
        if not success:
            self.failed_requests += 1
            return
        self.successful_requests += 1

    def get_stats(self):
        """Return a dict of display-ready statistics keyed by their labels."""
        elapsed = time.time() - self.start_time

        # Guard both ratios against a zero denominator.
        speed = 0
        if elapsed > 0:
            speed = self.translated_chars / elapsed

        rate = 0
        if self.total_requests > 0:
            rate = self.successful_requests / self.total_requests * 100

        report = {}
        report["总字符数"] = self.total_chars
        report["已翻译字符数"] = self.translated_chars
        report["翻译速度"] = f"{speed:.2f} 字符/秒"
        report["成功率"] = f"{rate:.1f}%"
        report["总请求数"] = self.total_requests
        report["成功请求"] = self.successful_requests
        report["失败请求"] = self.failed_requests
        report["运行时间"] = f"{elapsed:.1f} 秒"
        return report

# Create the module-wide statistics object
translation_stats = TranslationStats()
class DatabaseManager:
    """SQLite-backed store for translation progress and error records.

    Writes are queued as closures and applied in one transaction once
    ``batch_size`` of them are pending (or when a read, ``flush_updates`` or
    ``close`` forces it). Reads flush the queue first so they always observe
    writes made earlier through this object.
    """

    def __init__(self):
        """Open the database named in the configuration and create its tables."""
        self.db_path = config.get('database', 'path')
        self.conn = None
        self.batch_size = 100  # number of queued writes that triggers a flush
        self.pending_updates = []  # queued write operations (callables)
        self.init_db()

    def get_connection(self):
        """Return the shared connection, opening and configuring it on first use."""
        if self.conn is None:
            self.conn = sqlite3.connect(self.db_path)
            self.conn.row_factory = sqlite3.Row
            # Enable foreign-key enforcement
            self.conn.execute("PRAGMA foreign_keys = ON")
            # Request WAL journalling
            self.conn.execute("PRAGMA journal_mode = WAL")
        return self.conn

    def close(self):
        """Flush queued writes and close the connection.

        The connection is closed even when the final flush raises; the
        exception from the flush still propagates to the caller.
        """
        if self.conn is None:
            return
        try:
            # Commit everything that is still queued
            self.flush_updates()
        finally:
            self.conn.close()
            self.conn = None

    def flush_updates(self):
        """Apply all queued writes in a single transaction.

        On failure the transaction is rolled back, the error is logged and
        re-raised. The queue is emptied in either case, so a failed batch is
        not retried.
        """
        if not self.pending_updates:
            return

        # Detach the batch up front so the queue is reset whatever happens.
        batch, self.pending_updates = self.pending_updates, []
        try:
            self.begin_transaction()
            for apply_update in batch:
                apply_update()
            self.commit_transaction()
        except Exception as e:
            self.rollback_transaction()
            logging.error(f"批量更新失败: {str(e)}")
            raise

    def add_update(self, update_func):
        """Queue *update_func* and flush once ``batch_size`` writes are pending."""
        self.pending_updates.append(update_func)
        if len(self.pending_updates) >= self.batch_size:
            self.flush_updates()

    def update_file_progress(self, file_path, total_lines, processed_lines, status):
        """Queue an upsert of the progress row for *file_path*."""
        def update():
            c = self.get_connection().cursor()
            c.execute('''
                INSERT OR REPLACE INTO file_progress 
                (file_path, total_lines, processed_lines, status, last_updated)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', (file_path, total_lines, processed_lines, status))

        self.add_update(update)

    def update_line_progress(self, file_path, line_index, original_text, translated_text, status):
        """Queue an upsert of the progress row for one line of *file_path*."""
        def update():
            c = self.get_connection().cursor()
            c.execute('''
                INSERT OR REPLACE INTO line_progress 
                (file_path, line_index, original_text, translated_text, status, updated_at)
                VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', (file_path, line_index, original_text, translated_text, status))

        self.add_update(update)

    def update_group_progress(self, file_path, group_index, original_text, translated_text, status):
        """Queue an upsert of one translation group, tagged with the module ``VERSION``."""
        def update():
            c = self.get_connection().cursor()
            c.execute('''
                INSERT OR REPLACE INTO group_progress 
                (file_path, group_index, original_text, translated_text, status, version, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
            ''', (file_path, group_index, original_text, translated_text, status, VERSION))

        self.add_update(update)

    def log_error(self, file_path, line_index, error_type, error_message):
        """Queue the insertion of one row into the error log."""
        def update():
            c = self.get_connection().cursor()
            c.execute('''
                INSERT INTO error_log 
                (file_path, line_index, error_type, error_message)
                VALUES (?, ?, ?, ?)
            ''', (file_path, line_index, error_type, error_message))

        self.add_update(update)

    def init_db(self):
        """Create the progress and error tables if they do not exist yet."""
        conn = self.get_connection()
        c = conn.cursor()

        # Per-file progress
        c.execute('''
            CREATE TABLE IF NOT EXISTS file_progress (
                file_path TEXT PRIMARY KEY,
                total_lines INTEGER,
                processed_lines INTEGER,
                status TEXT,
                last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                error_count INTEGER DEFAULT 0,
                retry_count INTEGER DEFAULT 0
            )
        ''')

        # Per-line progress
        c.execute('''
            CREATE TABLE IF NOT EXISTS line_progress (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT,
                line_index INTEGER,
                original_text TEXT,
                translated_text TEXT,
                status TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                error_count INTEGER DEFAULT 0,
                retry_count INTEGER DEFAULT 0,
                UNIQUE(file_path, line_index)
            )
        ''')

        # Error log
        c.execute('''
            CREATE TABLE IF NOT EXISTS error_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT,
                line_index INTEGER,
                error_type TEXT,
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                resolved_at TIMESTAMP,
                resolution TEXT
            )
        ''')

        # Per-group progress
        c.execute('''
            CREATE TABLE IF NOT EXISTS group_progress (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT,
                group_index INTEGER,
                original_text TEXT,
                translated_text TEXT,
                status TEXT,
                version TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(file_path, group_index, version)
            )
        ''')

        conn.commit()

    def begin_transaction(self):
        """Start a transaction unless the connection already has one open."""
        conn = self.get_connection()
        # An explicit BEGIN inside an open transaction raises OperationalError.
        if not conn.in_transaction:
            conn.execute('BEGIN TRANSACTION')

    def commit_transaction(self):
        """Commit the current transaction."""
        self.get_connection().commit()

    def rollback_transaction(self):
        """Roll back the current transaction."""
        self.get_connection().rollback()

    def get_file_progress(self, file_path):
        """Return the progress row for *file_path*, or None when there is none."""
        # Apply queued writes first so the read reflects them.
        self.flush_updates()
        c = self.get_connection().cursor()
        c.execute('SELECT * FROM file_progress WHERE file_path = ?', (file_path,))
        return c.fetchone()

    def get_line_progress(self, file_path, line_index):
        """Return the progress row for one line, or None when there is none."""
        # Apply queued writes first so the read reflects them.
        self.flush_updates()
        c = self.get_connection().cursor()
        c.execute('''
            SELECT * FROM line_progress 
            WHERE file_path = ? AND line_index = ?
        ''', (file_path, line_index))
        return c.fetchone()

    def get_error_stats(self):
        """Return one row with total, unresolved and last-hour error counts."""
        # Apply queued writes first so the counts include them.
        self.flush_updates()
        c = self.get_connection().cursor()
        c.execute('''
            SELECT 
                COUNT(*) as total_errors,
                COUNT(CASE WHEN resolved_at IS NULL THEN 1 END) as unresolved_errors,
                COUNT(CASE WHEN created_at > datetime('now', '-1 hour') THEN 1 END) as recent_errors
            FROM error_log
        ''')
        return c.fetchone()


class AsyncTranslationManager:
    """Holds a concurrency semaphore sized from configuration and a session slot."""

    def __init__(self):
        """Create the semaphore from ``openai.max_concurrent_requests``; no session yet."""
        concurrency_limit = config.get('openai', 'max_concurrent_requests')
        self.semaphore = asyncio.Semaphore(concurrency_limit)
        self.session = None

class TranslationCache:
    """State for a translation cache: the mapping, its size limit and hit/miss counters."""

    def __init__(self):
        """Start with an empty mapping, zeroed counters and the configured size limit."""
        self.cache = dict()
        size_limit = config.get('translation', 'cache_size')
        self.max_size = size_limit
        self.hits = self.misses = 0

# Create the module-level instances
# NOTE(review): line_count_manager is a TranslationStats object despite its name — confirm intent.
line_count_manager = TranslationStats()
db_manager = DatabaseManager()
async_translation_manager = AsyncTranslationManager()
translation_cache = TranslationCache()

# Version control
VERSION = "1.0.1" # Version string used to tell apart translations produced by different versions
line_count = 4 # Lines per group, stated as fixed at 4; NOTE(review): calculate_group_size returns 2-4 — confirm this is still used

def get_completed_groups(conn, file_path):
    """Return (group_index, translated_text) rows of completed groups for *file_path*.

    Only rows whose version equals the module ``VERSION`` are returned, ordered
    by group index.
    """
    query_args = (file_path, VERSION)
    cursor = conn.cursor()
    cursor.execute('''
        SELECT group_index, translated_text 
        FROM group_progress 
        WHERE file_path = ? AND status = 'completed' AND version = ?
        ORDER BY group_index
    ''', query_args)
    completed_rows = cursor.fetchall()
    return completed_rows

class TokenBucket:
    """Token-bucket rate limiter.

    The bucket starts full, refills at ``rate`` tokens per second up to
    ``capacity``, and each granted request consumes one token.
    """

    def __init__(self, rate, capacity):
        """Create a full bucket.

        Args:
            rate: tokens added per second.
            capacity: maximum number of tokens the bucket can hold.
        """
        self.rate = rate  # refill rate in tokens per second
        self.capacity = capacity  # bucket capacity
        self.tokens = capacity  # tokens currently available
        # Monotonic clock: refill must not be affected by wall-clock changes.
        self.last_update = time.monotonic()
        self.lock = threading.Lock()

    def get_token(self):
        """Try to take one token without waiting.

        Returns:
            True if a token was available and has been consumed, else False.
        """
        with self.lock:
            now = time.monotonic()
            # Clamp at zero so a timestamp ahead of "now" can never drain the bucket.
            elapsed = max(0.0, now - self.last_update)
            self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
            self.last_update = now

            if self.tokens >= 1:
                self.tokens -= 1
                return True
            return False

    def wait_for_token(self):
        """Block, polling every 0.1 s, until a token has been taken."""
        while not self.get_token():
            time.sleep(0.1)

# Create the module-wide token bucket instance
token_bucket = TokenBucket(rate=2, capacity=10)  # refills 2 tokens per second; holds at most 10 tokens (burst size)

@retry(
    stop=stop_after_attempt(MODEL_CONFIG['max_retries']),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((openai.APIError, openai.APITimeoutError)),
    before_sleep=lambda retry_state: logging.warning(f"重试第 {retry_state.attempt_number} 次...")
)
def translate_text(text):
    """Translate *text* through the chat-completions API using streaming.

    Empty or whitespace-only input is returned unchanged without a request.
    Streamed fragments are echoed to stdout as they arrive and the assembled
    translation is returned. Every attempt, successful or not, is recorded in
    ``translation_stats``.

    API errors (including timeouts) are retried by the decorator with
    exponential backoff; other exceptions are not retried.

    Raises:
        ValueError: if the model returns an empty translation.
        openai.APIError: on API failure (re-raised after logging).
    """
    if not text or not text.strip():
        logging.warning("收到空文本，跳过翻译")
        return text

    try:
        # Wait for the rate limiter
        token_bucket.wait_for_token()

        messages = [
            {
                "role": "system",
                "content": "- 你名为epub翻译大师，专注于将任意语言的文本翻译成中文。- 你在翻译过程中，力求保留原文语意，确保翻译的准确性和完整性。- 你特别注重翻译结果要贴合现代人的阅读习惯，使译文更加流畅易懂。- 在处理包含代码结构的文本时，你会特别注意保持代码的原样。- 你的服务旨在为用户提供高效、便捷的翻译体验，帮助用户跨越语言障碍。- 在回答问题的时候，尽可能保留原来的代码结构。- 在回答问题的时候，尽可能只返回翻译后的内容和代码结构，不要返回任何其他内容。"
            },
            {
                "role": "user",
                "content": text
            }
        ]

        # Request a streamed completion
        stream = config.client.chat.completions.create(
            model=MODEL_CONFIG['model_name'],
            messages=messages,
            timeout=MODEL_CONFIG['timeout'],
            stream=True,  # stream the output
            temperature=0.3  # lower randomness for more stable translations
        )

        # Collect the streamed fragments
        fragments = []
        for chunk in stream:
            # Some chunks (e.g. usage-only ones) carry no choices at all.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                fragments.append(content)
                # Echo the translation as it arrives
                print(content, end='', flush=True)

        print()  # newline after the streamed output

        translated_text = "".join(fragments)

        # Validate the result
        if not translated_text.strip():
            raise ValueError("翻译结果为空")

        # Update statistics
        translation_stats.update_stats(text, translated_text, True)

        return translated_text

    # APITimeoutError is a subclass of APIError, so it has to be handled first
    # or its branch can never run.
    except openai.APITimeoutError as e:
        logging.error(f"OpenAI API超时: {str(e)}")
        translation_stats.update_stats(text, "", False)
        raise
    except openai.APIError as e:
        logging.error(f"OpenAI API错误: {str(e)}")
        translation_stats.update_stats(text, "", False)
        raise
    except Exception as e:
        logging.error(f"翻译出错: {str(e)}")
        translation_stats.update_stats(text, "", False)
        raise

def calculate_group_size(text_length):
    """Pick how many lines to group together: shorter text gets larger groups.

    Returns 4 below 1000 characters, 3 below 2000, and 2 otherwise.
    """
    # (exclusive upper bound, group size), checked in ascending order
    for length_limit, group_size in ((1000, 4), (2000, 3)):
        if text_length < length_limit:
            return group_size
    return 2

def resume_translation(file_path, db_manager):
    """Return the line to resume from: the stored processed-line count when the
    file's recorded status is 'interrupted', otherwise 0."""
    record = db_manager.get_file_progress(file_path)
    if not record or record['status'] != 'interrupted':
        return 0
    return record['processed_lines']

def _read_with_fallback_encoding(file_path):
    """Return the file's text using the first encoding that decodes it, else None."""
    for encoding in ('utf-8', 'gbk', 'gb2312', 'latin1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            continue
        logging.info(f"成功使用 {encoding} 编码读取文件: {file_path}")
        return content
    return None


def _is_translatable_line(line):
    """Return True for lines sent to the translator: classed <p> lines and lines starting with '<h'."""
    return '<p class' in line or line.startswith('<h')


def _collapse_to_single_line(text):
    """Join the non-blank lines of *text* with spaces so it occupies exactly one line."""
    return " ".join(part.strip() for part in text.split('\n') if part.strip())


def _partition_into_groups(lines):
    """Split *lines* into consecutive, non-overlapping (start_index, group) pairs."""
    groups = []
    start = 0
    while start < len(lines):
        size = calculate_group_size(len(lines[start]))
        groups.append((start, lines[start:start + size]))
        start += size
    return groups


def _load_cached_groups(conn, file_path):
    """Map group start index -> (original_text, translated_text) for completed groups of this VERSION."""
    rows = conn.execute('''
        SELECT group_index, original_text, translated_text
        FROM group_progress
        WHERE file_path = ? AND status = 'completed' AND version = ?
    ''', (file_path, VERSION)).fetchall()
    return {row[0]: (row[1], row[2]) for row in rows}


def _translate_group(group):
    """Return a copy of *group* with its translatable lines translated, one output line per input line."""
    result = list(group)
    targets = [idx for idx, line in enumerate(group) if _is_translatable_line(line)]
    if not targets:
        return result

    response = translate_text("\n".join(group[idx] for idx in targets))
    if len(targets) == 1:
        pieces = [_collapse_to_single_line(response)]
    else:
        pieces = [part.strip() for part in response.split('\n') if part.strip()]
        if len(pieces) != len(targets):
            # The model merged or split paragraphs; pairing by position would
            # shift translations onto the wrong lines, so translate one by one.
            logging.warning(f"译文行数({len(pieces)})与原文行数({len(targets)})不一致，改为逐段翻译")
            pieces = [_collapse_to_single_line(translate_text(group[idx])) for idx in targets]

    for idx, piece in zip(targets, pieces):
        result[idx] = piece
    return result


def _rebuild_body(body_content, translated_lines):
    """Re-create the body text, substituting each non-blank line with its entry in *translated_lines*."""
    replacement = iter(translated_lines)
    rebuilt = []
    for raw_line in body_content.split('\n'):
        stripped = raw_line.strip()
        # Blank lines are kept as blank; every other line consumes one entry.
        rebuilt.append(next(replacement, stripped) if stripped else '')
    return '\n'.join(rebuilt)


def process_html_file(file_path, conn):
    """Translate one HTML file and write the result to the configured output directory.

    The <title> text and the non-blank lines of <body> are processed. Body
    lines are split into consecutive groups; within each group only classed
    <p> lines and lines starting with '<h' are sent for translation, all other
    lines are kept as they are. Each finished group is stored in the database
    keyed by its starting line index, and a later run reuses a stored group
    only when its stored original text still matches, so interrupted or
    partially failed files resume without re-translating finished groups.

    A group that fails is logged, recorded in the error log and left
    untranslated in the output; the file is then marked 'error' rather than
    'completed' so that the next run retries it.

    Args:
        file_path: path of the HTML file to translate.
        conn: open SQLite connection used to read stored group translations.

    Returns:
        None. Exceptions (other than KeyboardInterrupt) are logged, not raised.
    """
    # Look up stored progress for this file
    progress = db_manager.get_file_progress(file_path)

    try:
        content = _read_with_fallback_encoding(file_path)
        if content is None:
            raise ValueError(f"无法使用支持的编码读取文件: {file_path}")

        # Locate the body and title elements
        body_match = re.search(r'<body[^>]*>(.*?)</body>', content, re.DOTALL)
        title_match = re.search(r'<title>(.*?)</title>', content, re.DOTALL)

        if not body_match:
            logging.warning(f"警告: {file_path} 中没有找到body标签")
            return

        body_content = body_match.group(1)

        # (start, end, new_text) spans of the original content to substitute.
        # Working with spans avoids str.replace missing or over-matching text.
        replacements = []

        # Handle the title
        if title_match:
            title_content = title_match.group(1).strip()
            if not title_content:
                logging.info("跳过空标题")
            elif title_match.end() <= body_match.start() or title_match.start() >= body_match.end():
                # A <title> inside <body> is handled with the body lines instead.
                logging.info(f"开始翻译标题: {title_content}")
                translated_title = translate_text(title_content)
                replacements.append((title_match.start(1), title_match.end(1), translated_title))
                logging.info(f"标题翻译完成: {translated_title}")

        # Keep every non-blank body line, stripped
        lines = [line.strip() for line in body_content.split('\n') if line.strip()]

        total_lines = len(lines)
        logging.info(f"文件 {file_path} 共有 {total_lines} 行需要处理")

        # Stored translations from earlier runs
        cached_groups = _load_cached_groups(conn, file_path)

        # Report stored progress (skipped when the stored total is zero)
        if progress and progress['total_lines']:
            progress_percentage = round(progress['processed_lines']*100/progress['total_lines'], 2)
            logging.info(f"文件 {file_path} 已处理进度: {progress['processed_lines']}/{progress['total_lines']} 行 ({progress_percentage}%)")

        # translated_lines stays parallel to `lines`: exactly one entry per line.
        translated_lines = []
        processed_lines = 0
        failed_groups = 0
        try:
            with tqdm(total=total_lines, desc=f"处理文件 {os.path.basename(file_path)}", unit="行") as pbar:
                for start, group in _partition_into_groups(lines):
                    original_text = "\n".join(group)

                    # Reuse a stored translation only if it belongs to this exact text
                    cached = cached_groups.get(start)
                    if cached is not None and cached[0] == original_text:
                        cached_lines = cached[1].split('\n')
                        if len(cached_lines) == len(group):
                            translated_lines.extend(cached_lines)
                            processed_lines = start + len(group)
                            pbar.update(len(group))
                            continue

                    translated_group = list(group)
                    try:
                        if any(_is_translatable_line(line) for line in group):
                            logging.info(f"开始翻译第 {start+1}-{start+len(group)} 行")
                        translated_group = _translate_group(group)

                        # Store the finished group
                        db_manager.update_group_progress(
                            file_path, start, original_text, "\n".join(translated_group), 'completed'
                        )

                        # Show current statistics
                        pbar.set_postfix(translation_stats.get_stats())

                        # Small delay to stay clear of API limits
                        time.sleep(0.1)
                    except Exception as e:
                        logging.error(f"处理组 {start} 时出错: {str(e)}")
                        # Record the error and carry on with the next group
                        db_manager.log_error(file_path, start, "group_processing_error", str(e))
                        failed_groups += 1

                    translated_lines.extend(translated_group)
                    processed_lines = start + len(group)
                    db_manager.update_file_progress(file_path, total_lines, processed_lines, 'in_progress')
                    pbar.update(len(group))

            if translated_lines:
                new_body_content = _rebuild_body(body_content, translated_lines)
                replacements.append((body_match.start(1), body_match.end(1), new_body_content))

                # Apply from the end backwards so earlier offsets stay valid
                new_content = content
                for span_start, span_end, new_text in sorted(replacements, reverse=True):
                    new_content = new_content[:span_start] + new_text + new_content[span_end:]

                # Save the translated file
                output_dir = config.get('paths', 'output_dir')
                os.makedirs(output_dir, exist_ok=True)
                output_path = os.path.join(output_dir, os.path.basename(file_path))

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(new_content)

                if failed_groups:
                    # Not 'completed': the next run must retry the failed groups.
                    db_manager.update_file_progress(file_path, total_lines, total_lines, 'error')
                    logging.warning(f"文件 {file_path} 有 {failed_groups} 个组翻译失败，输出已保存到 {output_path}，下次运行将重试")
                else:
                    db_manager.update_file_progress(file_path, total_lines, total_lines, 'completed')
                    logging.info(f"文件 {file_path} 翻译完成，已保存到 {output_path}")

                # Final statistics
                logging.info("\n翻译统计信息:")
                for key, value in translation_stats.get_stats().items():
                    logging.info(f"{key}: {value}")

            # Make the queued progress rows durable and visible to later reads
            db_manager.flush_updates()

        except KeyboardInterrupt:
            logging.warning("\n检测到中断，保存当前进度...")
            db_manager.update_file_progress(file_path, total_lines, processed_lines, 'interrupted')
            # Statistics at the time of the interrupt
            logging.info("\n中断时的统计信息:")
            for key, value in translation_stats.get_stats().items():
                logging.info(f"{key}: {value}")
            raise
        except Exception as e:
            logging.error(f"处理文件时出错: {str(e)}")
            db_manager.update_file_progress(file_path, total_lines, processed_lines, 'error')
            raise

    except Exception as e:
        logging.error(f"读取文件时出错: {str(e)}")
        return

def main():
    """Translate every .html file in the configured input directory, in name order.

    Files whose stored status is 'completed' are skipped. After each processed
    file the overall progress and running statistics are printed, followed by a
    5 second pause before the next file. The database is always closed on the
    way out, and final statistics are printed.
    """
    ops_dir = config.get('paths', 'input_dir')

    # Sorted by file name
    html_files = sorted(f for f in os.listdir(ops_dir) if f.endswith('.html'))

    total_files = len(html_files)
    print(f"找到 {total_files} 个HTML文件需要处理")
    print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Open the database connection
    conn = db_manager.get_connection()

    # Running count of files seen so far whose status is 'completed'; the
    # status of an earlier file never changes later, so one lookup per file
    # is enough.
    completed_files = 0

    try:
        for file_index, filename in enumerate(html_files, 1):
            file_path = os.path.join(ops_dir, filename)
            print(f"\n开始处理第 {file_index}/{total_files} 个文件: {filename}")
            print("-" * 50)

            # Skip files that are already done
            progress = db_manager.get_file_progress(file_path)
            if progress and progress['status'] == 'completed':
                completed_files += 1
                print(f"文件 {filename} 已经完成翻译，跳过")
                continue

            try:
                process_html_file(file_path, conn)
                # Progress rows are queued in batches; write them out so the
                # status lookup below sees the result for this file.
                db_manager.flush_updates()
                print(f"\n完成第 {file_index}/{total_files} 个文件: {filename}")
                print("-" * 50)
            except Exception as e:
                print(f"\n处理文件 {filename} 时出错: {str(e)}")
                print("继续处理下一个文件...")
                continue

            # Overall progress
            progress = db_manager.get_file_progress(file_path)
            if progress and progress['status'] == 'completed':
                completed_files += 1
            print(f"\n总体进度: {completed_files}/{total_files} 个文件完成 "
                  f"({round(completed_files*100/total_files, 2)}%)")

            # Running statistics
            print("\n当前统计信息:")
            for key, value in translation_stats.get_stats().items():
                print(f"{key}: {value}")

            # Short pause between files
            if file_index < total_files:
                print("\n等待 5 秒后处理下一个文件...")
                time.sleep(5)

    except KeyboardInterrupt:
        print("\n程序被用户中断")
    finally:
        db_manager.close()
        print(f"\n结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("\n最终统计信息:")
        for key, value in translation_stats.get_stats().items():
            print(f"{key}: {value}")

# Run the batch translation only when this file is executed as a script.
if __name__ == "__main__":
    main()