translate_epub_v4(单线程版本)V2.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839
  1. import os
  2. import re
  3. import openai
  4. import time
  5. from tqdm import tqdm
  6. import sqlite3
  7. from datetime import datetime
  8. import logging
  9. from logging.handlers import RotatingFileHandler
  10. from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
  11. import asyncio
  12. import yaml
  13. import threading
  14. from collections import deque
  15. # 配置管理
  16. class Config:
  17. def __init__(self, config_path='config.yaml'):
  18. self.config_path = config_path
  19. self.config = self.load_config()
  20. # 验证配置
  21. self.validate_config()
  22. # 设置日志
  23. self.setup_logging()
  24. # 初始化OpenAI客户端
  25. self.setup_openai()
  26. def validate_config(self):
  27. """验证配置项"""
  28. required_fields = {
  29. 'logging': ['level', 'format', 'file'],
  30. 'openai': ['base_url', 'api_key', 'model_name', 'max_retries', 'retry_delay', 'timeout', 'max_concurrent_requests'],
  31. 'translation': ['min_line_count', 'max_line_count', 'initial_line_count', 'error_threshold', 'success_threshold', 'error_cooldown', 'cache_size'],
  32. 'database': ['path', 'pool_size'],
  33. 'paths': ['input_dir', 'output_dir']
  34. }
  35. for section, fields in required_fields.items():
  36. if section not in self.config:
  37. raise ValueError(f"缺少配置节: {section}")
  38. for field in fields:
  39. if field not in self.config[section]:
  40. raise ValueError(f"缺少配置项: {section}.{field}")
  41. def load_config(self):
  42. """加载配置文件"""
  43. if not os.path.exists(self.config_path):
  44. # 创建默认配置
  45. default_config = {
  46. 'logging': {
  47. 'level': 'INFO',
  48. 'format': '%(asctime)s - %(levelname)s - %(message)s',
  49. 'file': 'translation.log'
  50. },
  51. 'openai': {
  52. 'base_url': 'https://api.siliconflow.cn/v1',
  53. 'api_key': 'sk-',
  54. 'model_name': 'deepseek-ai/DeepSeek-R1',
  55. 'max_retries': 3,
  56. 'retry_delay': 2,
  57. 'timeout': 30,
  58. 'max_concurrent_requests': 5
  59. },
  60. 'translation': {
  61. 'min_line_count': 1,
  62. 'max_line_count': 5,
  63. 'initial_line_count': 2,
  64. 'error_threshold': 3,
  65. 'success_threshold': 5,
  66. 'error_cooldown': 60,
  67. 'cache_size': 1000
  68. },
  69. 'database': {
  70. 'path': 'translation_progress.db',
  71. 'pool_size': 5
  72. },
  73. 'paths': {
  74. 'input_dir': '002/Ops',
  75. 'output_dir': '002/Ops_translated'
  76. }
  77. }
  78. # 保存默认配置
  79. with open(self.config_path, 'w', encoding='utf-8') as f:
  80. yaml.dump(default_config, f, allow_unicode=True)
  81. return default_config
  82. # 加载现有配置
  83. with open(self.config_path, 'r', encoding='utf-8') as f:
  84. return yaml.safe_load(f)
  85. def setup_logging(self):
  86. """设置日志"""
  87. log_file = self.config['logging']['file']
  88. log_dir = os.path.dirname(log_file)
  89. if log_dir and not os.path.exists(log_dir):
  90. os.makedirs(log_dir)
  91. # 创建日志处理器
  92. file_handler = RotatingFileHandler(
  93. log_file,
  94. maxBytes=10*1024*1024, # 10MB
  95. backupCount=5,
  96. encoding='utf-8'
  97. )
  98. console_handler = logging.StreamHandler()
  99. # 设置日志格式
  100. formatter = logging.Formatter(self.config['logging']['format'])
  101. file_handler.setFormatter(formatter)
  102. console_handler.setFormatter(formatter)
  103. # 配置根日志记录器
  104. root_logger = logging.getLogger()
  105. root_logger.setLevel(getattr(logging, self.config['logging']['level']))
  106. root_logger.addHandler(file_handler)
  107. root_logger.addHandler(console_handler)
  108. def setup_openai(self):
  109. """设置OpenAI客户端"""
  110. self.client = openai.OpenAI(
  111. base_url=self.config['openai']['base_url'],
  112. api_key=self.config['openai']['api_key']
  113. )
  114. def get(self, *keys):
  115. """获取配置值"""
  116. value = self.config
  117. for key in keys:
  118. value = value[key]
  119. return value
  120. def update(self, updates):
  121. """更新配置"""
  122. def deep_update(d, u):
  123. for k, v in u.items():
  124. if isinstance(v, dict):
  125. d[k] = deep_update(d.get(k, {}), v)
  126. else:
  127. d[k] = v
  128. return d
  129. self.config = deep_update(self.config, updates)
  130. # 保存更新后的配置
  131. with open(self.config_path, 'w', encoding='utf-8') as f:
  132. yaml.dump(self.config, f, allow_unicode=True)
  133. # 重新设置日志和OpenAI客户端
  134. self.setup_logging()
  135. self.setup_openai()
  136. # 创建全局的配置实例
  137. config = Config()
  138. # 更新全局变量
  139. MODEL_CONFIG = {
  140. "model_name": config.get('openai', 'model_name'),
  141. "max_retries": config.get('openai', 'max_retries'),
  142. "retry_delay": config.get('openai', 'retry_delay'),
  143. "timeout": config.get('openai', 'timeout'),
  144. "max_concurrent_requests": config.get('openai', 'max_concurrent_requests'),
  145. "cache_size": config.get('translation', 'cache_size')
  146. }
  147. MIN_LINE_COUNT = config.get('translation', 'min_line_count')
  148. MAX_LINE_COUNT = config.get('translation', 'max_line_count')
  149. INITIAL_LINE_COUNT = config.get('translation', 'initial_line_count')
  150. ERROR_THRESHOLD = config.get('translation', 'error_threshold')
  151. SUCCESS_THRESHOLD = config.get('translation', 'success_threshold')
  152. # 更新其他类的初始化参数
  153. class TranslationStats:
  154. def __init__(self):
  155. self.start_time = time.time()
  156. self.total_chars = 0
  157. self.translated_chars = 0
  158. self.total_requests = 0
  159. self.successful_requests = 0
  160. self.failed_requests = 0
  161. def update_stats(self, original_text, translated_text, success=True):
  162. self.total_chars += len(original_text)
  163. self.translated_chars += len(translated_text)
  164. self.total_requests += 1
  165. if success:
  166. self.successful_requests += 1
  167. else:
  168. self.failed_requests += 1
  169. def get_stats(self):
  170. elapsed_time = time.time() - self.start_time
  171. chars_per_second = self.translated_chars / elapsed_time if elapsed_time > 0 else 0
  172. success_rate = (self.successful_requests / self.total_requests * 100) if self.total_requests > 0 else 0
  173. return {
  174. "总字符数": self.total_chars,
  175. "已翻译字符数": self.translated_chars,
  176. "翻译速度": f"{chars_per_second:.2f} 字符/秒",
  177. "成功率": f"{success_rate:.1f}%",
  178. "总请求数": self.total_requests,
  179. "成功请求": self.successful_requests,
  180. "失败请求": self.failed_requests,
  181. "运行时间": f"{elapsed_time:.1f} 秒"
  182. }
# Global statistics object; translate_text() records every request on it and
# the progress output reads it through get_stats().
translation_stats = TranslationStats()
  185. class DatabaseManager:
  186. def __init__(self):
  187. self.db_path = config.get('database', 'path')
  188. self.conn = None
  189. self.batch_size = 100 # 批量更新的大小
  190. self.pending_updates = [] # 待更新的操作
  191. self.init_db()
  192. def get_connection(self):
  193. """获取数据库连接"""
  194. if self.conn is None:
  195. self.conn = sqlite3.connect(self.db_path)
  196. self.conn.row_factory = sqlite3.Row
  197. # 启用外键约束
  198. self.conn.execute("PRAGMA foreign_keys = ON")
  199. # 设置WAL模式提高并发性能
  200. self.conn.execute("PRAGMA journal_mode = WAL")
  201. return self.conn
  202. def close(self):
  203. """关闭数据库连接"""
  204. if self.conn:
  205. # 提交所有待处理的更新
  206. self.flush_updates()
  207. self.conn.close()
  208. self.conn = None
  209. def flush_updates(self):
  210. """提交所有待处理的更新"""
  211. if not self.pending_updates:
  212. return
  213. try:
  214. self.begin_transaction()
  215. for update in self.pending_updates:
  216. update()
  217. self.commit_transaction()
  218. except Exception as e:
  219. self.rollback_transaction()
  220. logging.error(f"批量更新失败: {str(e)}")
  221. raise
  222. finally:
  223. self.pending_updates = []
  224. def add_update(self, update_func):
  225. """添加待处理的更新操作"""
  226. self.pending_updates.append(update_func)
  227. if len(self.pending_updates) >= self.batch_size:
  228. self.flush_updates()
  229. def update_file_progress(self, file_path, total_lines, processed_lines, status):
  230. """更新文件翻译进度"""
  231. def update():
  232. c = self.get_connection().cursor()
  233. c.execute('''
  234. INSERT OR REPLACE INTO file_progress
  235. (file_path, total_lines, processed_lines, status, last_updated)
  236. VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
  237. ''', (file_path, total_lines, processed_lines, status))
  238. self.add_update(update)
  239. def update_line_progress(self, file_path, line_index, original_text, translated_text, status):
  240. """更新行翻译进度"""
  241. def update():
  242. c = self.get_connection().cursor()
  243. c.execute('''
  244. INSERT OR REPLACE INTO line_progress
  245. (file_path, line_index, original_text, translated_text, status, updated_at)
  246. VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  247. ''', (file_path, line_index, original_text, translated_text, status))
  248. self.add_update(update)
  249. def update_group_progress(self, file_path, group_index, original_text, translated_text, status):
  250. """更新翻译组进度"""
  251. def update():
  252. c = self.get_connection().cursor()
  253. c.execute('''
  254. INSERT OR REPLACE INTO group_progress
  255. (file_path, group_index, original_text, translated_text, status, version, updated_at)
  256. VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  257. ''', (file_path, group_index, original_text, translated_text, status, VERSION))
  258. self.add_update(update)
  259. def log_error(self, file_path, line_index, error_type, error_message):
  260. """记录错误"""
  261. def update():
  262. c = self.get_connection().cursor()
  263. c.execute('''
  264. INSERT INTO error_log
  265. (file_path, line_index, error_type, error_message)
  266. VALUES (?, ?, ?, ?)
  267. ''', (file_path, line_index, error_type, error_message))
  268. self.add_update(update)
  269. def init_db(self):
  270. """初始化数据库"""
  271. conn = self.get_connection()
  272. c = conn.cursor()
  273. # 创建文件进度表
  274. c.execute('''
  275. CREATE TABLE IF NOT EXISTS file_progress (
  276. file_path TEXT PRIMARY KEY,
  277. total_lines INTEGER,
  278. processed_lines INTEGER,
  279. status TEXT,
  280. last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  281. error_count INTEGER DEFAULT 0,
  282. retry_count INTEGER DEFAULT 0
  283. )
  284. ''')
  285. # 创建行进度表
  286. c.execute('''
  287. CREATE TABLE IF NOT EXISTS line_progress (
  288. id INTEGER PRIMARY KEY AUTOINCREMENT,
  289. file_path TEXT,
  290. line_index INTEGER,
  291. original_text TEXT,
  292. translated_text TEXT,
  293. status TEXT,
  294. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  295. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  296. error_count INTEGER DEFAULT 0,
  297. retry_count INTEGER DEFAULT 0,
  298. UNIQUE(file_path, line_index)
  299. )
  300. ''')
  301. # 创建错误日志表
  302. c.execute('''
  303. CREATE TABLE IF NOT EXISTS error_log (
  304. id INTEGER PRIMARY KEY AUTOINCREMENT,
  305. file_path TEXT,
  306. line_index INTEGER,
  307. error_type TEXT,
  308. error_message TEXT,
  309. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  310. resolved_at TIMESTAMP,
  311. resolution TEXT
  312. )
  313. ''')
  314. # 创建翻译组进度表
  315. c.execute('''
  316. CREATE TABLE IF NOT EXISTS group_progress (
  317. id INTEGER PRIMARY KEY AUTOINCREMENT,
  318. file_path TEXT,
  319. group_index INTEGER,
  320. original_text TEXT,
  321. translated_text TEXT,
  322. status TEXT,
  323. version TEXT,
  324. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  325. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  326. UNIQUE(file_path, group_index, version)
  327. )
  328. ''')
  329. conn.commit()
  330. def begin_transaction(self):
  331. """开始事务"""
  332. self.get_connection().execute('BEGIN TRANSACTION')
  333. def commit_transaction(self):
  334. """提交事务"""
  335. self.get_connection().commit()
  336. def rollback_transaction(self):
  337. """回滚事务"""
  338. self.get_connection().rollback()
  339. def get_file_progress(self, file_path):
  340. """获取文件翻译进度"""
  341. c = self.get_connection().cursor()
  342. c.execute('SELECT * FROM file_progress WHERE file_path = ?', (file_path,))
  343. return c.fetchone()
  344. def get_line_progress(self, file_path, line_index):
  345. """获取行翻译进度"""
  346. c = self.get_connection().cursor()
  347. c.execute('''
  348. SELECT * FROM line_progress
  349. WHERE file_path = ? AND line_index = ?
  350. ''', (file_path, line_index))
  351. return c.fetchone()
  352. def get_error_stats(self):
  353. """获取错误统计信息"""
  354. c = self.get_connection().cursor()
  355. c.execute('''
  356. SELECT
  357. COUNT(*) as total_errors,
  358. COUNT(CASE WHEN resolved_at IS NULL THEN 1 END) as unresolved_errors,
  359. COUNT(CASE WHEN created_at > datetime('now', '-1 hour') THEN 1 END) as recent_errors
  360. FROM error_log
  361. ''')
  362. return c.fetchone()
  363. class AsyncTranslationManager:
  364. def __init__(self):
  365. self.semaphore = asyncio.Semaphore(config.get('openai', 'max_concurrent_requests'))
  366. self.session = None
  367. class TranslationCache:
  368. def __init__(self):
  369. self.cache = {}
  370. self.max_size = config.get('translation', 'cache_size')
  371. self.hits = 0
  372. self.misses = 0
# Module-level singletons shared by the functions below.
# NOTE(review): line_count_manager is a second TranslationStats instance and is
# not referenced anywhere else in the visible code.
line_count_manager = TranslationStats()
db_manager = DatabaseManager()
async_translation_manager = AsyncTranslationManager()
translation_cache = TranslationCache()
# Version control for stored translations
VERSION = "1.0.1"  # version tag, used to tell translations from different versions apart
line_count = 4  # lines per group, fixed at 4 (not referenced elsewhere in the visible code)
  381. def get_completed_groups(conn, file_path):
  382. """获取已完成的翻译行"""
  383. c = conn.cursor()
  384. c.execute('''
  385. SELECT group_index, translated_text
  386. FROM group_progress
  387. WHERE file_path = ? AND status = 'completed' AND version = ?
  388. ORDER BY group_index
  389. ''', (file_path, VERSION))
  390. return c.fetchall()
  391. class TokenBucket:
  392. """令牌桶限流器"""
  393. def __init__(self, rate, capacity):
  394. self.rate = rate # 令牌产生速率(每秒)
  395. self.capacity = capacity # 桶容量
  396. self.tokens = capacity # 当前令牌数
  397. self.last_update = time.time()
  398. self.lock = threading.Lock()
  399. def get_token(self):
  400. """获取一个令牌"""
  401. with self.lock:
  402. now = time.time()
  403. # 计算新增的令牌
  404. new_tokens = (now - self.last_update) * self.rate
  405. self.tokens = min(self.capacity, self.tokens + new_tokens)
  406. self.last_update = now
  407. if self.tokens >= 1:
  408. self.tokens -= 1
  409. return True
  410. return False
  411. def wait_for_token(self):
  412. """等待直到获得令牌"""
  413. while not self.get_token():
  414. time.sleep(0.1)
# Global rate limiter used by translate_text()
token_bucket = TokenBucket(rate=2, capacity=10)  # refills 2 tokens per second; holds at most 10 tokens (burst size)
  417. @retry(
  418. stop=stop_after_attempt(MODEL_CONFIG['max_retries']),
  419. wait=wait_exponential(multiplier=1, min=4, max=10),
  420. retry=retry_if_exception_type((openai.APIError, openai.APITimeoutError)),
  421. before_sleep=lambda retry_state: logging.warning(f"重试第 {retry_state.attempt_number} 次...")
  422. )
  423. def translate_text(text):
  424. """翻译文本,使用流式输出"""
  425. if not text or not text.strip():
  426. logging.warning("收到空文本,跳过翻译")
  427. return text
  428. try:
  429. # 等待获取令牌
  430. token_bucket.wait_for_token()
  431. messages = [
  432. {
  433. "role": "system",
  434. "content": "- 你名为epub翻译大师,专注于将任意语言的文本翻译成中文。- 你在翻译过程中,力求保留原文语意,确保翻译的准确性和完整性。- 你特别注重翻译结果要贴合现代人的阅读习惯,使译文更加流畅易懂。- 在处理包含代码结构的文本时,你会特别注意保持代码的原样。- 你的服务旨在为用户提供高效、便捷的翻译体验,帮助用户跨越语言障碍。- 在回答问题的时候,尽可能保留原来的代码结构。- 在回答问题的时候,尽可能只返回翻译后的内容和代码结构,不要返回任何其他内容。"
  435. },
  436. {
  437. "role": "user",
  438. "content": text
  439. }
  440. ]
  441. # 使用流式输出
  442. stream = config.client.chat.completions.create(
  443. model=MODEL_CONFIG['model_name'],
  444. messages=messages,
  445. timeout=MODEL_CONFIG['timeout'],
  446. stream=True, # 启用流式输出
  447. temperature=0.3 # 降低随机性,使翻译更稳定
  448. )
  449. # 收集流式输出的内容
  450. translated_text = ""
  451. for chunk in stream:
  452. if chunk.choices[0].delta.content is not None:
  453. content = chunk.choices[0].delta.content
  454. translated_text += content
  455. # 实时打印翻译内容
  456. print(content, end='', flush=True)
  457. print() # 换行
  458. # 验证翻译结果
  459. if not translated_text or len(translated_text.strip()) == 0:
  460. raise ValueError("翻译结果为空")
  461. # 更新统计信息
  462. translation_stats.update_stats(text, translated_text, True)
  463. return translated_text
  464. except openai.APIError as e:
  465. logging.error(f"OpenAI API错误: {str(e)}")
  466. translation_stats.update_stats(text, "", False)
  467. raise
  468. except openai.APITimeoutError as e:
  469. logging.error(f"OpenAI API超时: {str(e)}")
  470. translation_stats.update_stats(text, "", False)
  471. raise
  472. except Exception as e:
  473. logging.error(f"翻译出错: {str(e)}")
  474. translation_stats.update_stats(text, "", False)
  475. raise
  476. def calculate_group_size(text_length):
  477. """根据文本长度动态计算分组大小"""
  478. if text_length < 1000:
  479. return 4
  480. elif text_length < 2000:
  481. return 3
  482. else:
  483. return 2
  484. def resume_translation(file_path, db_manager):
  485. """获取断点续传的起始位置"""
  486. progress = db_manager.get_file_progress(file_path)
  487. if progress and progress['status'] == 'interrupted':
  488. return progress['processed_lines']
  489. return 0
  490. def process_html_file(file_path, conn):
  491. """处理HTML文件"""
  492. # 检查文件进度
  493. progress = db_manager.get_file_progress(file_path)
  494. try:
  495. # 尝试不同的编码方式读取文件
  496. encodings = ['utf-8', 'gbk', 'gb2312', 'latin1']
  497. content = None
  498. for encoding in encodings:
  499. try:
  500. with open(file_path, 'r', encoding=encoding) as f:
  501. content = f.read()
  502. logging.info(f"成功使用 {encoding} 编码读取文件: {file_path}")
  503. break
  504. except UnicodeDecodeError:
  505. continue
  506. if content is None:
  507. raise Exception(f"无法使用支持的编码读取文件: {file_path}")
  508. # 使用正则表达式提取body标签内的内容和title标签
  509. body_pattern = re.compile(r'<body[^>]*>(.*?)</body>', re.DOTALL)
  510. title_pattern = re.compile(r'<title>(.*?)</title>', re.DOTALL)
  511. body_match = body_pattern.search(content)
  512. title_match = title_pattern.search(content)
  513. if not body_match:
  514. logging.warning(f"警告: {file_path} 中没有找到body标签")
  515. return
  516. body_content = body_match.group(1)
  517. # 处理title标签
  518. if title_match:
  519. title_content = title_match.group(1).strip()
  520. if title_content: # 只有当标题内容不为空时才处理
  521. logging.info(f"开始翻译标题: {title_content}")
  522. translated_title = translate_text(title_content)
  523. # 替换原始title内容
  524. content = content.replace(f"<title>{title_content}</title>", f"<title>{translated_title}</title>")
  525. logging.info(f"标题翻译完成: {translated_title}")
  526. else:
  527. logging.info("跳过空标题")
  528. # 按行分割内容,保留所有非空行
  529. lines = [line.strip() for line in body_content.split('\n') if line.strip()]
  530. total_lines = len(lines)
  531. logging.info(f"文件 {file_path} 共有 {total_lines} 行需要处理")
  532. # 获取已完成的翻译
  533. completed_lines = get_completed_groups(conn, file_path)
  534. completed_indices = {line[0] for line in completed_lines}
  535. # 获取断点续传位置
  536. start_line = resume_translation(file_path, db_manager)
  537. # 计算已处理的进度
  538. if progress:
  539. progress_percentage = round(progress['processed_lines']*100/progress['total_lines'], 2)
  540. logging.info(f"文件 {file_path} 已处理进度: {progress['processed_lines']}/{progress['total_lines']} 行 ({progress_percentage}%)")
  541. # 逐行处理内容
  542. translated_lines = []
  543. try:
  544. with tqdm(range(start_line, len(lines)), desc=f"处理文件 {os.path.basename(file_path)}", unit="行") as pbar:
  545. for i in range(start_line, len(lines)):
  546. # 计算当前组的大小
  547. current_group_size = calculate_group_size(len(lines[i]))
  548. group_index = i // current_group_size
  549. # 检查是否已完成
  550. if group_index in completed_indices:
  551. # 使用已完成的翻译
  552. for line in completed_lines:
  553. if line[0] == group_index:
  554. translated_lines.extend(line[1].split('\n'))
  555. break
  556. pbar.update(current_group_size)
  557. continue
  558. # 获取当前组的行
  559. group = lines[i:i+current_group_size]
  560. if group:
  561. try:
  562. # 收集需要翻译的段落
  563. paragraphs_to_translate = []
  564. paragraph_indices = []
  565. for idx, line in enumerate(group):
  566. if '<p class' in line or line.startswith('<h'):
  567. paragraphs_to_translate.append(line)
  568. paragraph_indices.append(idx)
  569. # 如果有需要翻译的段落,进行翻译
  570. if paragraphs_to_translate:
  571. # 将所有需要翻译的段落合并成一个文本
  572. combined_text = "\n".join(paragraphs_to_translate)
  573. logging.info(f"开始翻译第 {i+1}-{min(i+current_group_size, len(lines))} 行")
  574. translated_text = translate_text(combined_text)
  575. # 分割翻译后的文本
  576. translated_paragraphs = translated_text.split('\n')
  577. # 将翻译后的段落放回原位置
  578. translated_group = group.copy()
  579. for idx, translated in zip(paragraph_indices, translated_paragraphs):
  580. translated_group[idx] = translated
  581. else:
  582. translated_group = group
  583. # 保存原始文本和翻译后的文本
  584. original_text = "\n".join(group)
  585. translated_text = "\n".join(translated_group)
  586. # 更新翻译组进度
  587. db_manager.update_group_progress(file_path, group_index, original_text, translated_text, 'completed')
  588. # 分割翻译后的文本
  589. translated_lines.extend(translated_group)
  590. # 更新文件进度
  591. processed_lines = min((group_index + 1) * current_group_size, total_lines)
  592. db_manager.update_file_progress(file_path, total_lines, processed_lines, 'in_progress')
  593. # 显示当前统计信息
  594. stats = translation_stats.get_stats()
  595. pbar.set_postfix(stats)
  596. # 添加较小的延迟以避免API限制
  597. time.sleep(0.1)
  598. except Exception as e:
  599. logging.error(f"处理组 {group_index} 时出错: {str(e)}")
  600. # 记录错误但继续处理
  601. db_manager.log_error(file_path, group_index, "group_processing_error", str(e))
  602. continue
  603. pbar.update(current_group_size)
  604. # 替换原始内容
  605. if translated_lines:
  606. # 构建新的body内容
  607. new_body_content = []
  608. current_index = 0
  609. # 遍历原始内容,替换需要翻译的部分
  610. for line in body_content.split('\n'):
  611. line = line.strip()
  612. if not line:
  613. new_body_content.append('')
  614. continue
  615. if line.startswith('<'):
  616. if ('<p class' in line or line.startswith('<h')) and current_index < len(translated_lines):
  617. # 替换翻译后的内容
  618. new_body_content.append(translated_lines[current_index])
  619. current_index += 1
  620. else:
  621. # 保持原样
  622. new_body_content.append(line)
  623. else:
  624. # 保持非HTML内容原样
  625. new_body_content.append(line)
  626. # 将新内容重新组合
  627. new_body_content = '\n'.join(new_body_content)
  628. # 替换原始内容中的body部分
  629. new_content = content.replace(body_content, new_body_content)
  630. # 保存修改后的文件
  631. output_dir = config.get('paths', 'output_dir')
  632. os.makedirs(output_dir, exist_ok=True)
  633. output_path = os.path.join(output_dir, os.path.basename(file_path))
  634. with open(output_path, 'w', encoding='utf-8') as f:
  635. f.write(new_content)
  636. # 更新完成状态
  637. db_manager.update_file_progress(file_path, total_lines, total_lines, 'completed')
  638. logging.info(f"文件 {file_path} 翻译完成,已保存到 {output_path}")
  639. # 显示最终统计信息
  640. logging.info("\n翻译统计信息:")
  641. for key, value in translation_stats.get_stats().items():
  642. logging.info(f"{key}: {value}")
  643. except KeyboardInterrupt:
  644. logging.warning("\n检测到中断,保存当前进度...")
  645. if 'processed_lines' in locals():
  646. db_manager.update_file_progress(file_path, total_lines, processed_lines, 'interrupted')
  647. # 显示中断时的统计信息
  648. logging.info("\n中断时的统计信息:")
  649. for key, value in translation_stats.get_stats().items():
  650. logging.info(f"{key}: {value}")
  651. raise
  652. except Exception as e:
  653. logging.error(f"处理文件时出错: {str(e)}")
  654. if 'processed_lines' in locals():
  655. db_manager.update_file_progress(file_path, total_lines, processed_lines, 'error')
  656. raise
  657. except Exception as e:
  658. logging.error(f"读取文件时出错: {str(e)}")
  659. return
  660. def main():
  661. ops_dir = config.get('paths', 'input_dir')
  662. html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
  663. # 按文件名排序
  664. html_files.sort()
  665. total_files = len(html_files)
  666. print(f"找到 {total_files} 个HTML文件需要处理")
  667. print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  668. # 初始化数据库连接
  669. conn = db_manager.get_connection()
  670. try:
  671. for file_index, filename in enumerate(html_files, 1):
  672. file_path = os.path.join(ops_dir, filename)
  673. print(f"\n开始处理第 {file_index}/{total_files} 个文件: {filename}")
  674. print("-" * 50)
  675. # 检查文件是否已完成
  676. progress = db_manager.get_file_progress(file_path)
  677. if progress and progress['status'] == 'completed':
  678. print(f"文件 {filename} 已经完成翻译,跳过")
  679. continue
  680. try:
  681. process_html_file(file_path, conn)
  682. print(f"\n完成第 {file_index}/{total_files} 个文件: {filename}")
  683. print("-" * 50)
  684. except Exception as e:
  685. print(f"\n处理文件 {filename} 时出错: {str(e)}")
  686. print("继续处理下一个文件...")
  687. continue
  688. # 显示当前总体进度
  689. completed_files = sum(1 for f in html_files[:file_index]
  690. if db_manager.get_file_progress(os.path.join(ops_dir, f)) and
  691. db_manager.get_file_progress(os.path.join(ops_dir, f))['status'] == 'completed')
  692. print(f"\n总体进度: {completed_files}/{total_files} 个文件完成 "
  693. f"({round(completed_files*100/total_files, 2)}%)")
  694. # 显示统计信息
  695. print("\n当前统计信息:")
  696. for key, value in translation_stats.get_stats().items():
  697. print(f"{key}: {value}")
  698. # 在文件之间添加短暂延迟
  699. if file_index < total_files:
  700. print("\n等待 5 秒后处理下一个文件...")
  701. time.sleep(5)
  702. except KeyboardInterrupt:
  703. print("\n程序被用户中断")
  704. finally:
  705. db_manager.close()
  706. print(f"\n结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  707. print("\n最终统计信息:")
  708. for key, value in translation_stats.get_stats().items():
  709. print(f"{key}: {value}")
# Run the batch translation only when this file is executed as a script.
if __name__ == "__main__":
    main()