# translate_epub_v2.py

  1. import os
  2. import re
  3. from bs4 import BeautifulSoup
  4. import openai
  5. import time
  6. from tqdm import tqdm
  7. import sqlite3
  8. import json
  9. from datetime import datetime
# Initialise the OpenAI-compatible client.
# Several provider endpoints are listed; only the SiliconFlow base_url/api_key
# pair is active, the others are kept commented out for reference.
# NOTE(review): api_key="sk-" looks like a placeholder to be filled in — confirm.
client = openai.OpenAI(
    # chatnio
    # base_url="https://api.chatnio.net/v1",
    # api_key="sk-"
    # deepseek
    # base_url="https://api.deepseek.com/v1",
    # api_key="sk-"
    # Qwen/Qwen3-32B
    base_url="https://api.siliconflow.cn/v1",
    api_key="sk-"
)
# model_name = "Qwen/Qwen3-32B" # Qwen/Qwen3-32B
model_name = "deepseek-ai/DeepSeek-R1" # deepseek-ai/DeepSeek-R1
# Version control.
# NOTE(review): VERSION and line_count are not referenced by the code visible
# in this file; the active values come from LineCountManager — confirm whether
# they can be removed.
VERSION = "1.0.1" # Version label meant to tell different translation runs apart
line_count = 2 # Lines per group: larger is faster but more error-prone
  27. # 自动调整参数
  28. MIN_LINE_COUNT = 1
  29. MAX_LINE_COUNT = 5
  30. INITIAL_LINE_COUNT = 2
  31. ERROR_THRESHOLD = 3 # 连续错误次数阈值
  32. SUCCESS_THRESHOLD = 5 # 连续成功次数阈值
  33. class LineCountManager:
  34. def __init__(self):
  35. self.current_line_count = INITIAL_LINE_COUNT
  36. self.consecutive_errors = 0
  37. self.consecutive_successes = 0
  38. self.last_error_time = None
  39. self.error_cooldown = 60 # 错误冷却时间(秒)
  40. self.version = f"1.0.{INITIAL_LINE_COUNT}" # 初始版本号
  41. def adjust_line_count(self, success):
  42. current_time = time.time()
  43. # 检查是否在冷却期内
  44. if self.last_error_time and (current_time - self.last_error_time) < self.error_cooldown:
  45. return self.current_line_count
  46. if success:
  47. self.consecutive_errors = 0
  48. self.consecutive_successes += 1
  49. # 如果连续成功次数达到阈值,尝试增加行数
  50. if self.consecutive_successes >= SUCCESS_THRESHOLD:
  51. if self.current_line_count < MAX_LINE_COUNT:
  52. self.current_line_count += 1
  53. self.consecutive_successes = 0
  54. self.version = f"1.0.{self.current_line_count}" # 更新版本号
  55. print(f"翻译连续成功,增加行数到 {self.current_line_count},版本更新为 {self.version}")
  56. else:
  57. self.consecutive_successes = 0
  58. self.consecutive_errors += 1
  59. self.last_error_time = current_time
  60. # 如果连续错误次数达到阈值,减少行数
  61. if self.consecutive_errors >= ERROR_THRESHOLD:
  62. if self.current_line_count > MIN_LINE_COUNT:
  63. self.current_line_count -= 1
  64. self.consecutive_errors = 0
  65. self.version = f"1.0.{self.current_line_count}" # 更新版本号
  66. print(f"翻译连续失败,减少行数到 {self.current_line_count},版本更新为 {self.version}")
  67. return self.current_line_count
  68. # 创建全局的LineCountManager实例
  69. line_count_manager = LineCountManager()
  70. class TranslationStats:
  71. def __init__(self):
  72. self.start_time = time.time()
  73. self.total_chars = 0
  74. self.translated_chars = 0
  75. self.total_requests = 0
  76. self.successful_requests = 0
  77. self.failed_requests = 0
  78. def update_stats(self, original_text, translated_text, success=True):
  79. self.total_chars += len(original_text)
  80. self.translated_chars += len(translated_text)
  81. self.total_requests += 1
  82. if success:
  83. self.successful_requests += 1
  84. else:
  85. self.failed_requests += 1
  86. def get_stats(self):
  87. elapsed_time = time.time() - self.start_time
  88. chars_per_second = self.translated_chars / elapsed_time if elapsed_time > 0 else 0
  89. success_rate = (self.successful_requests / self.total_requests * 100) if self.total_requests > 0 else 0
  90. return {
  91. "总字符数": self.total_chars,
  92. "已翻译字符数": self.translated_chars,
  93. "翻译速度": f"{chars_per_second:.2f} 字符/秒",
  94. "成功率": f"{success_rate:.1f}%",
  95. "总请求数": self.total_requests,
  96. "成功请求": self.successful_requests,
  97. "失败请求": self.failed_requests,
  98. "运行时间": f"{elapsed_time:.1f} 秒"
  99. }
  100. # 创建全局的统计对象
  101. translation_stats = TranslationStats()
  102. def init_db():
  103. """初始化数据库"""
  104. conn = sqlite3.connect('translation_progress.db')
  105. c = conn.cursor()
  106. # 检查是否需要迁移数据库
  107. try:
  108. c.execute("SELECT version FROM file_progress LIMIT 1")
  109. except sqlite3.OperationalError:
  110. # 如果表不存在或没有version字段,进行迁移
  111. print("正在更新数据库结构...")
  112. # 备份旧表
  113. c.execute("ALTER TABLE file_progress RENAME TO file_progress_old")
  114. c.execute("ALTER TABLE group_progress RENAME TO group_progress_old")
  115. # 创建新表
  116. c.execute('''
  117. CREATE TABLE IF NOT EXISTS file_progress (
  118. file_path TEXT PRIMARY KEY,
  119. total_lines INTEGER,
  120. processed_lines INTEGER,
  121. status TEXT,
  122. version TEXT,
  123. last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  124. )
  125. ''')
  126. c.execute('''
  127. CREATE TABLE IF NOT EXISTS group_progress (
  128. id INTEGER PRIMARY KEY AUTOINCREMENT,
  129. file_path TEXT,
  130. group_index INTEGER,
  131. original_text TEXT,
  132. translated_text TEXT,
  133. status TEXT,
  134. version TEXT,
  135. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  136. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  137. UNIQUE(file_path, group_index, version)
  138. )
  139. ''')
  140. # 迁移数据
  141. try:
  142. c.execute('''
  143. INSERT INTO file_progress
  144. (file_path, total_lines, processed_lines, status, version, last_updated)
  145. SELECT file_path, total_lines, processed_lines, status, ?, last_updated
  146. FROM file_progress_old
  147. ''', (line_count_manager.version,))
  148. c.execute('''
  149. INSERT INTO group_progress
  150. (file_path, group_index, original_text, translated_text, status, version, created_at, updated_at)
  151. SELECT file_path, group_index, original_text, translated_text, status, ?, created_at, updated_at
  152. FROM group_progress_old
  153. ''', (line_count_manager.version,))
  154. # 删除旧表
  155. c.execute("DROP TABLE file_progress_old")
  156. c.execute("DROP TABLE group_progress_old")
  157. print("数据库迁移完成")
  158. except sqlite3.OperationalError as e:
  159. print(f"迁移数据时出错: {str(e)}")
  160. # 如果迁移失败,回滚到原始表
  161. c.execute("DROP TABLE IF EXISTS file_progress")
  162. c.execute("DROP TABLE IF EXISTS group_progress")
  163. c.execute("ALTER TABLE file_progress_old RENAME TO file_progress")
  164. c.execute("ALTER TABLE group_progress_old RENAME TO group_progress")
  165. raise
  166. else:
  167. # 如果表已存在且包含version字段,创建新表
  168. c.execute('''
  169. CREATE TABLE IF NOT EXISTS file_progress (
  170. file_path TEXT PRIMARY KEY,
  171. total_lines INTEGER,
  172. processed_lines INTEGER,
  173. status TEXT,
  174. version TEXT,
  175. last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  176. )
  177. ''')
  178. c.execute('''
  179. CREATE TABLE IF NOT EXISTS group_progress (
  180. id INTEGER PRIMARY KEY AUTOINCREMENT,
  181. file_path TEXT,
  182. group_index INTEGER,
  183. original_text TEXT,
  184. translated_text TEXT,
  185. status TEXT,
  186. version TEXT,
  187. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  188. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  189. UNIQUE(file_path, group_index, version)
  190. )
  191. ''')
  192. conn.commit()
  193. return conn
  194. def get_file_progress(conn, file_path):
  195. """获取文件翻译进度"""
  196. c = conn.cursor()
  197. c.execute('SELECT * FROM file_progress WHERE file_path = ?', (file_path,))
  198. return c.fetchone()
  199. def update_file_progress(conn, file_path, total_lines, processed_lines, status):
  200. """更新文件翻译进度"""
  201. c = conn.cursor()
  202. c.execute('''
  203. INSERT OR REPLACE INTO file_progress
  204. (file_path, total_lines, processed_lines, status, version, last_updated)
  205. VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  206. ''', (file_path, total_lines, processed_lines, status, line_count_manager.version))
  207. conn.commit()
  208. def get_group_progress(conn, file_path, group_index):
  209. """获取翻译组进度"""
  210. c = conn.cursor()
  211. c.execute('''
  212. SELECT * FROM group_progress
  213. WHERE file_path = ? AND group_index = ?
  214. ''', (file_path, group_index))
  215. return c.fetchone()
  216. def update_group_progress(conn, file_path, group_index, original_text, translated_text, status):
  217. """更新翻译组进度"""
  218. c = conn.cursor()
  219. c.execute('''
  220. INSERT OR REPLACE INTO group_progress
  221. (file_path, group_index, original_text, translated_text, status, version, updated_at)
  222. VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  223. ''', (file_path, group_index, original_text, translated_text, status, line_count_manager.version))
  224. conn.commit()
  225. def get_completed_groups(conn, file_path):
  226. """获取已完成的翻译组"""
  227. c = conn.cursor()
  228. c.execute('''
  229. SELECT group_index, translated_text
  230. FROM group_progress
  231. WHERE file_path = ? AND status = 'completed' AND version = ?
  232. ORDER BY group_index
  233. ''', (file_path, line_count_manager.version))
  234. return c.fetchall()
# Unused system-prompt fragment, kept for reference:
#   "- The output must be wrapped in a code block,
#    adding language annotations only when necessary"
  238. def translate_text(text, max_retries=3):
  239. """翻译文本,添加重试机制"""
  240. start_time = time.time()
  241. for attempt in range(max_retries):
  242. try:
  243. messages = [
  244. {
  245. "role": "system",
  246. "content": "- 你名为epub翻译大师,专注于将任意语言的文本翻译成中文。- 你在翻译过程中,力求保留原文语意,确保翻译的准确性和完整性。- 你特别注重翻译结果要贴合现代人的阅读习惯,使译文更加流畅易懂。- 在处理包含代码结构的文本时,你会特别注意保持代码的原样。- 你的服务旨在为用户提供高效、便捷的翻译体验,帮助用户跨越语言障碍。- 在回答问题的时候,尽可能保留原来的代码结构。"
  247. },
  248. {
  249. "role": "user",
  250. "content": text
  251. }
  252. ]
  253. response = client.chat.completions.create(
  254. model=model_name,
  255. messages=messages
  256. )
  257. translated_text = response.choices[0].message.content
  258. # 更新统计信息
  259. translation_stats.update_stats(text, translated_text, True)
  260. # 计算并显示本次翻译的速度
  261. elapsed = time.time() - start_time
  262. chars_per_second = len(translated_text) / elapsed if elapsed > 0 else 0
  263. print(f"\n翻译速度: {chars_per_second:.2f} 字符/秒")
  264. # 翻译成功,调整行数
  265. line_count_manager.adjust_line_count(True)
  266. return translated_text
  267. except Exception as e:
  268. if attempt == max_retries - 1:
  269. print(f"翻译失败,已达到最大重试次数: {str(e)}")
  270. # 更新统计信息
  271. translation_stats.update_stats(text, text, False)
  272. # 翻译失败,调整行数
  273. line_count_manager.adjust_line_count(False)
  274. return text
  275. print(f"翻译出错,正在重试 ({attempt + 1}/{max_retries}): {str(e)}")
  276. time.sleep(2 ** attempt) # 指数退避
  277. def process_html_file(file_path, conn):
  278. """处理HTML文件"""
  279. # 检查文件进度
  280. progress = get_file_progress(conn, file_path)
  281. try:
  282. # 尝试不同的编码方式读取文件
  283. encodings = ['utf-8', 'gbk', 'gb2312', 'latin1']
  284. content = None
  285. for encoding in encodings:
  286. try:
  287. with open(file_path, 'r', encoding=encoding) as f:
  288. content = f.read()
  289. break
  290. except UnicodeDecodeError:
  291. continue
  292. if content is None:
  293. raise Exception(f"无法使用支持的编码读取文件: {file_path}")
  294. # 使用正则表达式提取body标签内的内容
  295. body_pattern = re.compile(r'<body[^>]*>(.*?)</body>', re.DOTALL)
  296. body_match = body_pattern.search(content)
  297. if not body_match:
  298. print(f"警告: {file_path} 中没有找到body标签")
  299. return
  300. body_content = body_match.group(1)
  301. # 按行分割内容,保留所有HTML标签行,但只翻译包含 <p class 的行
  302. lines = []
  303. for line in body_content.split('\n'):
  304. line = line.strip()
  305. if line and line.startswith('<'):
  306. lines.append(line)
  307. total_lines = len(lines)
  308. # 获取已完成的翻译组
  309. completed_groups = get_completed_groups(conn, file_path)
  310. completed_indices = {group[0] for group in completed_groups}
  311. # 计算已处理的进度
  312. if progress:
  313. print(f"文件 {file_path} 已处理进度: {progress[2]}/{progress[1]} 行 ({round(progress[2]*100/progress[1], 2)}%)")
  314. # 按组处理内容
  315. translated_lines = []
  316. try:
  317. with tqdm(range(0, len(lines), line_count_manager.current_line_count),
  318. desc=f"处理文件 {os.path.basename(file_path)}",
  319. unit="组") as pbar:
  320. for i in pbar:
  321. group_index = i // line_count_manager.current_line_count
  322. # 检查是否已完成
  323. if group_index in completed_indices:
  324. # 使用已完成的翻译
  325. for group in completed_groups:
  326. if group[0] == group_index:
  327. translated_lines.extend(group[1].split('\n'))
  328. break
  329. continue
  330. group = lines[i:i+line_count_manager.current_line_count]
  331. if group:
  332. # 保存原始文本
  333. original_text = "\n".join(group)
  334. # 收集需要翻译的段落
  335. paragraphs_to_translate = []
  336. paragraph_indices = []
  337. for idx, line in enumerate(group):
  338. if '<p class' in line:
  339. paragraphs_to_translate.append(line)
  340. paragraph_indices.append(idx)
  341. # 如果有需要翻译的段落,进行翻译
  342. if paragraphs_to_translate:
  343. translated_paragraphs = []
  344. for paragraph in paragraphs_to_translate:
  345. translated_paragraph = translate_text(paragraph)
  346. translated_paragraphs.append(translated_paragraph)
  347. # 将翻译后的段落放回原位置
  348. translated_group = group.copy()
  349. for idx, translated in zip(paragraph_indices, translated_paragraphs):
  350. translated_group[idx] = translated
  351. else:
  352. translated_group = group
  353. translated_text = "\n".join(translated_group)
  354. # 更新翻译组进度
  355. update_group_progress(conn, file_path, group_index, original_text, translated_text, 'completed')
  356. # 分割翻译后的文本
  357. translated_lines.extend(translated_group)
  358. # 更新文件进度
  359. processed_lines = min((group_index + 1) * line_count_manager.current_line_count, total_lines)
  360. update_file_progress(conn, file_path, total_lines, processed_lines, 'in_progress')
  361. # 显示当前统计信息
  362. stats = translation_stats.get_stats()
  363. pbar.set_postfix(stats)
  364. # 添加较小的延迟以避免API限制
  365. time.sleep(0.1) # 减少延迟时间
  366. # 替换原始内容
  367. if translated_lines:
  368. # 保持原始内容的顺序和结构
  369. new_body_content = body_content
  370. current_index = 0
  371. # 遍历原始内容,替换需要翻译的部分
  372. for line in body_content.split('\n'):
  373. line = line.strip()
  374. if line and line.startswith('<'):
  375. if '<p class' in line and current_index < len(translated_lines):
  376. # 替换翻译后的内容
  377. new_body_content = new_body_content.replace(line, translated_lines[current_index])
  378. current_index += 1
  379. else:
  380. # 保持原样
  381. continue
  382. new_content = content.replace(body_content, new_body_content)
  383. # 保存修改后的文件
  384. with open(file_path, 'w', encoding='utf-8') as f:
  385. f.write(new_content)
  386. # 更新完成状态
  387. update_file_progress(conn, file_path, total_lines, total_lines, 'completed')
  388. print(f"文件 {file_path} 翻译完成")
  389. # 显示最终统计信息
  390. print("\n翻译统计信息:")
  391. for key, value in translation_stats.get_stats().items():
  392. print(f"{key}: {value}")
  393. except KeyboardInterrupt:
  394. print("\n检测到中断,保存当前进度...")
  395. if 'processed_lines' in locals():
  396. update_file_progress(conn, file_path, total_lines, processed_lines, 'interrupted')
  397. # 显示中断时的统计信息
  398. print("\n中断时的统计信息:")
  399. for key, value in translation_stats.get_stats().items():
  400. print(f"{key}: {value}")
  401. raise
  402. except Exception as e:
  403. print(f"处理文件时出错: {str(e)}")
  404. if 'processed_lines' in locals():
  405. update_file_progress(conn, file_path, total_lines, processed_lines, 'error')
  406. raise
  407. except Exception as e:
  408. print(f"读取文件时出错: {str(e)}")
  409. return
  410. def main():
  411. ops_dir = "002/Ops"
  412. html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
  413. print(f"找到 {len(html_files)} 个HTML文件需要处理")
  414. print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  415. # 初始化数据库连接
  416. conn = init_db()
  417. try:
  418. for filename in tqdm(html_files, desc="处理文件", unit="文件"):
  419. file_path = os.path.join(ops_dir, filename)
  420. process_html_file(file_path, conn)
  421. except KeyboardInterrupt:
  422. print("\n程序被用户中断")
  423. finally:
  424. conn.close()
  425. print(f"\n结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  426. print("\n最终统计信息:")
  427. for key, value in translation_stats.get_stats().items():
  428. print(f"{key}: {value}")
  429. if __name__ == "__main__":
  430. main()