translate_epub_v5(多线程版本).py

import os
import re
from bs4 import BeautifulSoup
import openai
import time
from tqdm import tqdm
import sqlite3
import json
from datetime import datetime
import logging
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import hashlib
import yaml
from pathlib import Path
import multiprocessing
from multiprocessing import Pool, Manager, Lock
import queue


# Configuration management
class Config:
    def __init__(self, config_path='config.yaml'):
        self.config_path = config_path
        self.config = self.load_config()
        # Set up logging
        self.setup_logging()
        # Initialize the OpenAI client
        self.setup_openai()

    def load_config(self):
        """Load the configuration file."""
        if not os.path.exists(self.config_path):
            # Create a default configuration
            default_config = {
                'logging': {
                    'level': 'INFO',
                    'format': '%(asctime)s - %(levelname)s - %(message)s',
                    'file': 'translation.log'
                },
                'openai': {
                    'base_url': 'https://api.siliconflow.cn/v1',
                    'api_key': 'sk-',
                    'model_name': 'deepseek-ai/DeepSeek-R1',
                    'max_retries': 3,
                    'retry_delay': 2,
                    'timeout': 30,
                    'max_concurrent_requests': 5
                },
                'translation': {
                    'min_line_count': 1,
                    'max_line_count': 5,
                    'initial_line_count': 2,
                    'error_threshold': 3,
                    'success_threshold': 5,
                    'error_cooldown': 60,
                    'cache_size': 1000
                },
                'database': {
                    'path': 'translation_progress.db',
                    'pool_size': 5
                },
                'paths': {
                    'input_dir': '002/Ops',
                    'output_dir': '002/Ops_translated'
                }
            }
            # Save the default configuration
            with open(self.config_path, 'w', encoding='utf-8') as f:
                yaml.dump(default_config, f, allow_unicode=True)
            return default_config
        # Load the existing configuration
        with open(self.config_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    def setup_logging(self):
        """Set up logging."""
        logging.basicConfig(
            level=getattr(logging, self.config['logging']['level']),
            format=self.config['logging']['format'],
            handlers=[
                logging.FileHandler(self.config['logging']['file']),
                logging.StreamHandler()
            ]
        )

    def setup_openai(self):
        """Set up the OpenAI client."""
        self.client = openai.OpenAI(
            base_url=self.config['openai']['base_url'],
            api_key=self.config['openai']['api_key']
        )

    def get(self, *keys):
        """Get a configuration value by nested keys."""
        value = self.config
        for key in keys:
            value = value[key]
        return value

    def update(self, updates):
        """Update the configuration."""
        def deep_update(d, u):
            for k, v in u.items():
                if isinstance(v, dict):
                    d[k] = deep_update(d.get(k, {}), v)
                else:
                    d[k] = v
            return d

        self.config = deep_update(self.config, updates)
        # Save the updated configuration
        with open(self.config_path, 'w', encoding='utf-8') as f:
            yaml.dump(self.config, f, allow_unicode=True)
        # Re-initialize logging and the OpenAI client
        self.setup_logging()
        self.setup_openai()


# Create the global configuration instance
config = Config()

# Global settings derived from the configuration
MODEL_CONFIG = {
    "model_name": config.get('openai', 'model_name'),
    "max_retries": config.get('openai', 'max_retries'),
    "retry_delay": config.get('openai', 'retry_delay'),
    "timeout": config.get('openai', 'timeout'),
    "max_concurrent_requests": config.get('openai', 'max_concurrent_requests'),
    "cache_size": config.get('translation', 'cache_size')
}
MIN_LINE_COUNT = config.get('translation', 'min_line_count')
MAX_LINE_COUNT = config.get('translation', 'max_line_count')
INITIAL_LINE_COUNT = config.get('translation', 'initial_line_count')
ERROR_THRESHOLD = config.get('translation', 'error_threshold')
SUCCESS_THRESHOLD = config.get('translation', 'success_threshold')


# Classes initialized from the configuration above
class LineCountManager:
    def __init__(self):
        self.current_line_count = INITIAL_LINE_COUNT
        self.consecutive_errors = 0
        self.consecutive_successes = 0
        self.last_error_time = None
        self.error_cooldown = config.get('translation', 'error_cooldown')
        self.version = f"1.0.{INITIAL_LINE_COUNT}"
        self.error_history = []

    def adjust_line_count(self, success):
        """Adjust the group size based on the translation result."""
        current_time = time.time()
        # Skip adjustment while still inside the cooldown window
        if self.last_error_time and (current_time - self.last_error_time) < self.error_cooldown:
            return self.current_line_count
        if success:
            self.consecutive_errors = 0
            self.consecutive_successes = 0  # Reset the success counter, but do not grow the group size
        else:
            self.consecutive_successes = 0
            self.consecutive_errors += 1
            self.last_error_time = current_time
            # Record the error
            self.error_history.append({
                'time': current_time,
                'line_count': self.current_line_count
            })
            # If consecutive errors reach the threshold, shrink the group size
            if self.consecutive_errors >= ERROR_THRESHOLD:
                if self.current_line_count > MIN_LINE_COUNT:
                    self.current_line_count -= 1
                    self.consecutive_errors = 0
                    self.version = f"1.0.{self.current_line_count}"
                    logging.warning(f"Consecutive translation failures; reducing group size to {self.current_line_count}, version updated to {self.version}")
        return self.current_line_count

    def get_error_stats(self):
        """Get error statistics."""
        if not self.error_history:
            return "No errors recorded"
        recent_errors = [e for e in self.error_history if time.time() - e['time'] < 3600]  # Errors within the last hour
        return {
            "total_errors": len(self.error_history),
            "errors_last_hour": len(recent_errors),
            "current_line_count": self.current_line_count,
            "consecutive_errors": self.consecutive_errors,
            "consecutive_successes": self.consecutive_successes
        }


class DatabaseManager:
    def __init__(self):
        self.db_path = config.get('database', 'path')
        self.conn = None
        self.init_db()

    def get_connection(self):
        """Get a database connection."""
        if self.conn is None:
            self.conn = sqlite3.connect(self.db_path)
            self.conn.row_factory = sqlite3.Row
        return self.conn

    def close(self):
        """Close the database connection."""
        if self.conn:
            self.conn.close()
            self.conn = None

    def init_db(self):
        """Initialize the database."""
        conn = self.get_connection()
        c = conn.cursor()
        # File progress table
        c.execute('''
            CREATE TABLE IF NOT EXISTS file_progress (
                file_path TEXT PRIMARY KEY,
                total_lines INTEGER,
                processed_lines INTEGER,
                status TEXT,
                version TEXT,
                last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                error_count INTEGER DEFAULT 0,
                retry_count INTEGER DEFAULT 0
            )
        ''')
        # Translation group progress table
        c.execute('''
            CREATE TABLE IF NOT EXISTS group_progress (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT,
                group_index INTEGER,
                original_text TEXT,
                translated_text TEXT,
                status TEXT,
                version TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                error_count INTEGER DEFAULT 0,
                retry_count INTEGER DEFAULT 0,
                UNIQUE(file_path, group_index, version)
            )
        ''')
        # Error log table
        c.execute('''
            CREATE TABLE IF NOT EXISTS error_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT,
                group_index INTEGER,
                error_type TEXT,
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                resolved_at TIMESTAMP,
                resolution TEXT
            )
        ''')
        conn.commit()

    def begin_transaction(self):
        """Begin a transaction."""
        self.get_connection().execute('BEGIN TRANSACTION')

    def commit_transaction(self):
        """Commit the current transaction."""
        self.get_connection().commit()

    def rollback_transaction(self):
        """Roll back the current transaction."""
        self.get_connection().rollback()

    def get_file_progress(self, file_path):
        """Get the translation progress of a file."""
        c = self.get_connection().cursor()
        c.execute('SELECT * FROM file_progress WHERE file_path = ?', (file_path,))
        return c.fetchone()

    def update_file_progress(self, file_path, total_lines, processed_lines, status):
        """Update the translation progress of a file."""
        c = self.get_connection().cursor()
        c.execute('''
            INSERT OR REPLACE INTO file_progress
            (file_path, total_lines, processed_lines, status, version, last_updated)
            VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ''', (file_path, total_lines, processed_lines, status, line_count_manager.version))
        self.get_connection().commit()

    def get_group_progress(self, file_path, group_index):
        """Get the progress of a translation group."""
        c = self.get_connection().cursor()
        c.execute('''
            SELECT * FROM group_progress
            WHERE file_path = ? AND group_index = ? AND version = ?
        ''', (file_path, group_index, line_count_manager.version))
        return c.fetchone()

    def update_group_progress(self, file_path, group_index, original_text, translated_text, status):
        """Update the progress of a translation group."""
        c = self.get_connection().cursor()
        c.execute('''
            INSERT OR REPLACE INTO group_progress
            (file_path, group_index, original_text, translated_text, status, version, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ''', (file_path, group_index, original_text, translated_text, status, line_count_manager.version))
        self.get_connection().commit()

    def log_error(self, file_path, group_index, error_type, error_message):
        """Record an error."""
        c = self.get_connection().cursor()
        c.execute('''
            INSERT INTO error_log
            (file_path, group_index, error_type, error_message)
            VALUES (?, ?, ?, ?)
        ''', (file_path, group_index, error_type, error_message))
        self.get_connection().commit()

    def get_error_stats(self):
        """Get error statistics."""
        c = self.get_connection().cursor()
        c.execute('''
            SELECT
                COUNT(*) as total_errors,
                COUNT(CASE WHEN resolved_at IS NULL THEN 1 END) as unresolved_errors,
                COUNT(CASE WHEN created_at > datetime('now', '-1 hour') THEN 1 END) as recent_errors
            FROM error_log
        ''')
        return c.fetchone()


class AsyncTranslationManager:
    def __init__(self):
        self.semaphore = asyncio.Semaphore(config.get('openai', 'max_concurrent_requests'))
        self.session = None


class TranslationCache:
    def __init__(self):
        self.cache = {}
        self.max_size = config.get('translation', 'cache_size')
        self.hits = 0
        self.misses = 0


# Create global instances
line_count_manager = LineCountManager()
db_manager = DatabaseManager()
async_translation_manager = AsyncTranslationManager()
translation_cache = TranslationCache()

# Version control
VERSION = "1.0.1"  # Version number, used to distinguish translations produced by different versions
line_count = 2  # Lines per group: larger is faster but more error-prone (superseded by line_count_manager)


class TranslationStats:
    def __init__(self):
        self.start_time = time.time()
        self.total_chars = 0
        self.translated_chars = 0
        self.total_requests = 0
        self.successful_requests = 0
        self.failed_requests = 0

    def update_stats(self, original_text, translated_text, success=True):
        self.total_chars += len(original_text)
        self.translated_chars += len(translated_text)
        self.total_requests += 1
        if success:
            self.successful_requests += 1
        else:
            self.failed_requests += 1

    def get_stats(self):
        elapsed_time = time.time() - self.start_time
        chars_per_second = self.translated_chars / elapsed_time if elapsed_time > 0 else 0
        success_rate = (self.successful_requests / self.total_requests * 100) if self.total_requests > 0 else 0
        return {
            "total_chars": self.total_chars,
            "translated_chars": self.translated_chars,
            "speed": f"{chars_per_second:.2f} chars/s",
            "success_rate": f"{success_rate:.1f}%",
            "total_requests": self.total_requests,
            "successful_requests": self.successful_requests,
            "failed_requests": self.failed_requests,
            "elapsed": f"{elapsed_time:.1f} s"
        }

    def to_dict(self):
        """Return a serializable dictionary."""
        return {
            "total_chars": self.total_chars,
            "translated_chars": self.translated_chars,
            "total_requests": self.total_requests,
            "successful_requests": self.successful_requests,
            "failed_requests": self.failed_requests,
            "elapsed_time": time.time() - self.start_time
        }

    @classmethod
    def from_dict(cls, data):
        """Create an instance from a dictionary."""
        stats = cls()
        stats.total_chars = data.get("total_chars", 0)
        stats.translated_chars = data.get("translated_chars", 0)
        stats.total_requests = data.get("total_requests", 0)
        stats.successful_requests = data.get("successful_requests", 0)
        stats.failed_requests = data.get("failed_requests", 0)
        stats.start_time = time.time() - data.get("elapsed_time", 0)
        return stats


# Create the global statistics object
translation_stats = TranslationStats()


def get_completed_groups(conn, file_path):
    """Get the completed translation groups for a file."""
    c = conn.cursor()
    c.execute('''
        SELECT group_index, translated_text
        FROM group_progress
        WHERE file_path = ? AND status = 'completed' AND version = ?
        ORDER BY group_index
    ''', (file_path, line_count_manager.version))
    return c.fetchall()


# """ - Wrap the output in a code block,
# and only add language comments when necessary.
# """
@retry(
    stop=stop_after_attempt(MODEL_CONFIG['max_retries']),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type((openai.APIError, openai.APITimeoutError)),
    before_sleep=lambda retry_state: logging.warning(f"Retrying, attempt {retry_state.attempt_number}...")
)
def translate_text(text, file_name=None, context=None):
    """Translate text, using streaming output."""
    try:
        # Build the context information
        context_info = ""
        if file_name:
            context_info += f"Currently translating file: {file_name}\n"
        if context:
            context_info += f"Context: {context}\n"
        messages = [
            {
                "role": "system",
                "content": f"""You are a professional translation assistant focused on translating text into Chinese.
- You are translating a complete document; keep the translation style consistent.
- Take the context into account so the translation stays coherent.
- Use consistent translations for technical terms.
- Preserve the original formatting and structure.
- Do not add any extra explanations or notes.
- Return only the translated content and nothing else.
{context_info}"""
            },
            {
                "role": "user",
                "content": text
            }
        ]
        # Use streaming output
        stream = config.client.chat.completions.create(
            model=MODEL_CONFIG['model_name'],
            messages=messages,
            timeout=MODEL_CONFIG['timeout'],
            stream=True  # Enable streaming
        )
        # Collect the streamed content
        translated_text = ""
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                translated_text += content
                # Print the translation in real time
                print(content, end='', flush=True)
        print()  # Newline
        # Update the statistics
        if hasattr(process_files_batch, 'process_stats'):
            process_files_batch.process_stats.update_stats(text, translated_text, True)
        return translated_text
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        # Update the statistics
        if hasattr(process_files_batch, 'process_stats'):
            process_files_batch.process_stats.update_stats(text, "", False)
        raise


def process_html_file(file_path, conn):
    """Process a single HTML file."""
    # Check the file progress
    progress = db_manager.get_file_progress(file_path)
    try:
        # Try different encodings when reading the file
        encodings = ['utf-8', 'gbk', 'gb2312', 'latin1']
        content = None
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
                break
            except UnicodeDecodeError:
                continue
        if content is None:
            raise Exception(f"Unable to read file with any supported encoding: {file_path}")
        # Extract the content inside the body tag with a regular expression
        body_pattern = re.compile(r'<body[^>]*>(.*?)</body>', re.DOTALL)
        body_match = body_pattern.search(content)
        if not body_match:
            print(f"Warning: no body tag found in {file_path}")
            return
        body_content = body_match.group(1)
        # Split the content by line, keeping all HTML tag lines but only translating lines that contain <p class
        lines = []
        for line in body_content.split('\n'):
            line = line.strip()
            if line and line.startswith('<'):
                lines.append(line)
        total_lines = len(lines)
        # Get the completed translation groups
        completed_groups = get_completed_groups(conn, file_path)
        completed_indices = {group[0] for group in completed_groups}
        # Report the progress processed so far
        if progress:
            print(f"File {file_path} progress: {progress[2]}/{progress[1]} lines ({round(progress[2]*100/progress[1], 2)}%)")
        # Process the content in groups
        translated_lines = []
        try:
            with tqdm(range(0, len(lines), line_count_manager.current_line_count),
                      desc=f"Processing {os.path.basename(file_path)}",
                      unit="group") as pbar:
                for i in pbar:
                    group_index = i // line_count_manager.current_line_count
                    # Check whether this group is already completed
                    if group_index in completed_indices:
                        # Reuse the completed translation
                        for group in completed_groups:
                            if group[0] == group_index:
                                translated_lines.extend(group[1].split('\n'))
                                break
                        continue
                    group = lines[i:i+line_count_manager.current_line_count]
                    if group:
                        # Save the original text
                        original_text = "\n".join(group)
                        # Collect the paragraphs that need translation
                        paragraphs_to_translate = []
                        paragraph_indices = []
                        for idx, line in enumerate(group):
                            if '<p class' in line or line.startswith('<h'):
                                paragraphs_to_translate.append(line)
                                paragraph_indices.append(idx)
                        # Translate the collected paragraphs, if any
                        if paragraphs_to_translate:
                            translated_paragraphs = []
                            for paragraph in paragraphs_to_translate:
                                print(f"\nTranslating paragraph {len(translated_paragraphs) + 1}/{len(paragraphs_to_translate)}:")
                                translated_paragraph = translate_text(paragraph)
                                translated_paragraphs.append(translated_paragraph)
                            # Put the translated paragraphs back in their original positions
                            translated_group = group.copy()
                            for idx, translated in zip(paragraph_indices, translated_paragraphs):
                                translated_group[idx] = translated
                        else:
                            translated_group = group
                        translated_text = "\n".join(translated_group)
                        # Update the translation group progress
                        db_manager.update_group_progress(file_path, group_index, original_text, translated_text, 'completed')
                        # Append the translated lines
                        translated_lines.extend(translated_group)
                        # Update the file progress
                        processed_lines = min((group_index + 1) * line_count_manager.current_line_count, total_lines)
                        db_manager.update_file_progress(file_path, total_lines, processed_lines, 'in_progress')
                        # Show the current statistics
                        stats = translation_stats.get_stats()
                        pbar.set_postfix(stats)
                        # Add a small delay to avoid API rate limits
                        time.sleep(0.1)
            # Replace the original content
            if translated_lines:
                # Build the new body content
                new_body_content = []
                current_index = 0
                # Walk the original content and substitute the translated parts
                for line in body_content.split('\n'):
                    line = line.strip()
                    if not line:
                        new_body_content.append('')
                        continue
                    if line.startswith('<'):
                        if ('<p class' in line or line.startswith('<h')) and current_index < len(translated_lines):
                            # Substitute the translated content
                            new_body_content.append(translated_lines[current_index])
                            current_index += 1
                        else:
                            # Keep the line as-is
                            new_body_content.append(line)
                    else:
                        # Keep non-HTML content as-is
                        new_body_content.append(line)
                # Reassemble the new body content
                new_body_content = '\n'.join(new_body_content)
                # Replace the body section in the original content
                new_content = content.replace(body_content, new_body_content)
                # Save the modified file
                output_dir = config.get('paths', 'output_dir')
                os.makedirs(output_dir, exist_ok=True)
                output_path = os.path.join(output_dir, os.path.basename(file_path))
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(new_content)
                # Mark the file as completed
                db_manager.update_file_progress(file_path, total_lines, total_lines, 'completed')
                print(f"File {file_path} translated and saved to {output_path}")
                # Show the final statistics
                print("\nTranslation statistics:")
                for key, value in translation_stats.get_stats().items():
                    print(f"{key}: {value}")
        except KeyboardInterrupt:
            print("\nInterrupt detected, saving current progress...")
            if 'processed_lines' in locals():
                db_manager.update_file_progress(file_path, total_lines, processed_lines, 'interrupted')
            # Show the statistics at the time of interruption
            print("\nStatistics at interruption:")
            for key, value in translation_stats.get_stats().items():
                print(f"{key}: {value}")
            raise
        except Exception as e:
            print(f"Error while processing file: {str(e)}")
            if 'processed_lines' in locals():
                db_manager.update_file_progress(file_path, total_lines, processed_lines, 'error')
            raise
    except Exception as e:
        print(f"Error while reading file: {str(e)}")
        return


def process_files_batch(file_batch, process_id):
    """Process a batch of files; executed by each worker process."""
    try:
        # Create an independent database connection for each process
        process_db = DatabaseManager()
        conn = process_db.get_connection()
        # Create a per-process statistics object
        process_stats = TranslationStats()
        # Expose it on the function object so the hasattr() hook in translate_text can update it
        process_files_batch.process_stats = process_stats
        # Create a lock so only one translation request runs at a time within this process
        file_lock = Lock()

        def translate_with_stats(text, file_name, context=None):
            """Wrap the translation function to serialize requests and collect statistics."""
            with file_lock:
                return translate_text(text, file_name, context)

        for filename in tqdm(file_batch, desc=f"Process {process_id} files", unit="file"):
            file_path = os.path.join(config.get('paths', 'input_dir'), filename)
            try:
                # Try different encodings when reading the file
                encodings = ['utf-8', 'gbk', 'gb2312', 'latin1']
                content = None
                for encoding in encodings:
                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            content = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                if content is None:
                    raise Exception(f"Unable to read file with any supported encoding: {file_path}")
                # Extract the content inside the body tag with a regular expression
                body_pattern = re.compile(r'<body[^>]*>(.*?)</body>', re.DOTALL)
                body_match = body_pattern.search(content)
                if not body_match:
                    print(f"Warning: no body tag found in {file_path}")
                    continue
                body_content = body_match.group(1)
                # Split the content by line, keeping all HTML tag lines but only translating lines that contain <p class
                lines = []
                for line in body_content.split('\n'):
                    line = line.strip()
                    if line and line.startswith('<'):
                        lines.append(line)
                total_lines = len(lines)
                # Get the completed translation groups
                completed_groups = get_completed_groups(conn, file_path)
                completed_indices = {group[0] for group in completed_groups}
                # Report the progress processed so far
                progress = db_manager.get_file_progress(file_path)
                if progress:
                    print(f"File {file_path} progress: {progress[2]}/{progress[1]} lines ({round(progress[2]*100/progress[1], 2)}%)")
                # Process the content in groups
                translated_lines = []
                with tqdm(range(0, len(lines), line_count_manager.current_line_count),
                          desc=f"Processing {os.path.basename(file_path)}",
                          unit="group") as pbar:
                    for i in pbar:
                        group_index = i // line_count_manager.current_line_count
                        # Check whether this group is already completed
                        if group_index in completed_indices:
                            # Reuse the completed translation
                            for group in completed_groups:
                                if group[0] == group_index:
                                    translated_lines.extend(group[1].split('\n'))
                                    break
                            continue
                        group = lines[i:i+line_count_manager.current_line_count]
                        if group:
                            # Save the original text
                            original_text = "\n".join(group)
                            # Collect the paragraphs that need translation
                            paragraphs_to_translate = []
                            paragraph_indices = []
                            for idx, line in enumerate(group):
                                if '<p class' in line or line.startswith('<h'):
                                    paragraphs_to_translate.append(line)
                                    paragraph_indices.append(idx)
                            # Translate the collected paragraphs, if any
                            if paragraphs_to_translate:
                                translated_paragraphs = []
                                for paragraph in paragraphs_to_translate:
                                    print(f"\nTranslating paragraph {len(translated_paragraphs) + 1}/{len(paragraphs_to_translate)}:")
                                    # Gather context (previous and next segments)
                                    context = ""
                                    if len(translated_lines) > 0:
                                        context += f"Previous: {translated_lines[-1]}\n"
                                    if i + line_count_manager.current_line_count < len(lines):
                                        context += f"Next: {lines[i + line_count_manager.current_line_count]}"
                                    translated_paragraph = translate_with_stats(
                                        paragraph,
                                        filename,
                                        context
                                    )
                                    translated_paragraphs.append(translated_paragraph)
                                # Put the translated paragraphs back in their original positions
                                translated_group = group.copy()
                                for idx, translated in zip(paragraph_indices, translated_paragraphs):
                                    translated_group[idx] = translated
                            else:
                                translated_group = group
                            translated_text = "\n".join(translated_group)
                            # Update the translation group progress
                            db_manager.update_group_progress(file_path, group_index, original_text, translated_text, 'completed')
                            # Append the translated lines
                            translated_lines.extend(translated_group)
                            # Update the file progress
                            processed_lines = min((group_index + 1) * line_count_manager.current_line_count, total_lines)
                            db_manager.update_file_progress(file_path, total_lines, processed_lines, 'in_progress')
                            # Show the current statistics
                            stats = process_stats.get_stats()
                            pbar.set_postfix(stats)
                            # Add a small delay to avoid API rate limits
                            time.sleep(0.1)
                # Replace the original content
                if translated_lines:
                    # Build the new body content
                    new_body_content = []
                    current_index = 0
                    # Walk the original content and substitute the translated parts
                    for line in body_content.split('\n'):
                        line = line.strip()
                        if not line:
                            new_body_content.append('')
                            continue
                        if line.startswith('<'):
                            if ('<p class' in line or line.startswith('<h')) and current_index < len(translated_lines):
                                # Substitute the translated content
                                new_body_content.append(translated_lines[current_index])
                                current_index += 1
                            else:
                                # Keep the line as-is
                                new_body_content.append(line)
                        else:
                            # Keep non-HTML content as-is
                            new_body_content.append(line)
                    # Reassemble the new body content
                    new_body_content = '\n'.join(new_body_content)
                    # Replace the body section in the original content
                    new_content = content.replace(body_content, new_body_content)
                    # Save the modified file
                    output_dir = config.get('paths', 'output_dir')
                    os.makedirs(output_dir, exist_ok=True)
                    output_path = os.path.join(output_dir, os.path.basename(file_path))
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(new_content)
                    # Mark the file as completed
                    db_manager.update_file_progress(file_path, total_lines, total_lines, 'completed')
                    print(f"File {file_path} translated and saved to {output_path}")
                    # Show the final statistics
                    print("\nTranslation statistics:")
                    for key, value in process_stats.get_stats().items():
                        print(f"{key}: {value}")
            except Exception as e:
                print(f"Error while processing file: {str(e)}")
                if 'processed_lines' in locals():
                    db_manager.update_file_progress(file_path, total_lines, processed_lines, 'error')
                raise
        # Return this process's statistics
        return process_stats.to_dict()
    except Exception as e:
        logging.error(f"Process {process_id} encountered an error: {str(e)}")
        return None
    finally:
        if 'process_db' in locals():
            process_db.close()


def aggregate_stats(stats_list):
    """Aggregate the statistics from all worker processes."""
    aggregated_stats = {
        "total_chars": 0,
        "translated_chars": 0,
        "total_requests": 0,
        "successful_requests": 0,
        "failed_requests": 0,
        "elapsed_time": 0
    }
    for stats in stats_list:
        if not stats:
            continue
        for key in aggregated_stats:
            if key in stats:
                if key == "elapsed_time":
                    aggregated_stats[key] = max(aggregated_stats[key], stats[key])
                else:
                    aggregated_stats[key] += stats[key]
    # Create a statistics object and format the output
    final_stats = TranslationStats.from_dict(aggregated_stats)
    return final_stats.get_stats()


def main():
    # Number of worker processes
    num_processes = 2
    # Collect all HTML files in the input directory
    input_dir = config.get('paths', 'input_dir')
    html_files = [f for f in os.listdir(input_dir) if f.endswith('.html')]
    print(f"Found {len(html_files)} HTML files to process")
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Cap the number of processes at the number of files
    num_processes = min(num_processes, len(html_files))
    # Split the file list into batches
    if num_processes > 0:
        batch_size = max(1, len(html_files) // num_processes)
        file_batches = [html_files[i:i + batch_size] for i in range(0, len(html_files), batch_size)]
    else:
        print("No HTML files found to process")
        return
    try:
        # Create the process pool
        with Pool(processes=num_processes) as pool:
            # Dispatch the batches to worker processes and collect the results
            results = []
            for i, batch in enumerate(file_batches):
                result = pool.apply_async(process_files_batch, args=(batch, i+1))
                results.append(result)
            # Wait for all processes to finish and collect their statistics
            stats_list = [r.get() for r in results]
    except Exception as e:
        logging.error(f"Process pool error: {str(e)}")
        stats_list = []
    # Aggregate the statistics
    final_stats = aggregate_stats(stats_list)
    print(f"\nEnd time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\nFinal statistics:")
    for key, value in final_stats.items():
        print(f"{key}: {value}")


if __name__ == "__main__":
    # Set the multiprocessing start method
    multiprocessing.set_start_method('spawn')
    # Set the logging format
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    main()
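
# ---------------------------------------------------------------------------
# For reference: a sketch of the config.yaml that Config.load_config() writes
# on its first run. The values simply mirror default_config above (yaml.dump
# may reorder the keys alphabetically); edit api_key and the paths before a
# real run. This block is a comment only and is never executed.
#
# logging:
#   level: INFO
#   format: '%(asctime)s - %(levelname)s - %(message)s'
#   file: translation.log
# openai:
#   base_url: https://api.siliconflow.cn/v1
#   api_key: sk-
#   model_name: deepseek-ai/DeepSeek-R1
#   max_retries: 3
#   retry_delay: 2
#   timeout: 30
#   max_concurrent_requests: 5
# translation:
#   min_line_count: 1
#   max_line_count: 5
#   initial_line_count: 2
#   error_threshold: 3
#   success_threshold: 5
#   error_cooldown: 60
#   cache_size: 1000
# database:
#   path: translation_progress.db
#   pool_size: 5
# paths:
#   input_dir: 002/Ops
#   output_dir: 002/Ops_translated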