# translate_epub_v1.py
  1. import os
  2. import re
  3. from bs4 import BeautifulSoup
  4. import openai
  5. import time
  6. from tqdm import tqdm
  7. import sqlite3
  8. import json
  9. # 初始化OpenAI客户端
  10. client = openai.OpenAI(
  11. # chatnio
  12. # base_url="https://api.chatnio.net/v1",
  13. # api_key="sk-"
  14. # deepseek
  15. # base_url="https://api.deepseek.com/v1",
  16. # api_key="sk-"
  17. # Qwen/Qwen3-32B
  18. base_url="https://api.siliconflow.cn/v1",
  19. api_key="sk-"
  20. )
  21. # model_name = "Qwen/Qwen3-32B" # Qwen/Qwen3-32B
  22. model_name = "deepseek-ai/DeepSeek-R1" # deepseek-ai/DeepSeek-R1
  23. # 添加版本控制
  24. VERSION = "1.0.1" # 版本号,用于区分不同版本的翻译
  25. line_count = 2 # 每组行数,越大越快,但越容易出错
  26. # 自动调整参数
  27. MIN_LINE_COUNT = 1
  28. MAX_LINE_COUNT = 5
  29. INITIAL_LINE_COUNT = 2
  30. ERROR_THRESHOLD = 3 # 连续错误次数阈值
  31. SUCCESS_THRESHOLD = 5 # 连续成功次数阈值
  32. class LineCountManager:
  33. def __init__(self):
  34. self.current_line_count = INITIAL_LINE_COUNT
  35. self.consecutive_errors = 0
  36. self.consecutive_successes = 0
  37. self.last_error_time = None
  38. self.error_cooldown = 60 # 错误冷却时间(秒)
  39. self.version = f"1.0.{INITIAL_LINE_COUNT}" # 初始版本号
  40. def adjust_line_count(self, success):
  41. current_time = time.time()
  42. # 检查是否在冷却期内
  43. if self.last_error_time and (current_time - self.last_error_time) < self.error_cooldown:
  44. return self.current_line_count
  45. if success:
  46. self.consecutive_errors = 0
  47. self.consecutive_successes += 1
  48. # 如果连续成功次数达到阈值,尝试增加行数
  49. if self.consecutive_successes >= SUCCESS_THRESHOLD:
  50. if self.current_line_count < MAX_LINE_COUNT:
  51. self.current_line_count += 1
  52. self.consecutive_successes = 0
  53. self.version = f"1.0.{self.current_line_count}" # 更新版本号
  54. print(f"翻译连续成功,增加行数到 {self.current_line_count},版本更新为 {self.version}")
  55. else:
  56. self.consecutive_successes = 0
  57. self.consecutive_errors += 1
  58. self.last_error_time = current_time
  59. # 如果连续错误次数达到阈值,减少行数
  60. if self.consecutive_errors >= ERROR_THRESHOLD:
  61. if self.current_line_count > MIN_LINE_COUNT:
  62. self.current_line_count -= 1
  63. self.consecutive_errors = 0
  64. self.version = f"1.0.{self.current_line_count}" # 更新版本号
  65. print(f"翻译连续失败,减少行数到 {self.current_line_count},版本更新为 {self.version}")
  66. return self.current_line_count
# Module-wide LineCountManager instance; the database helpers read its
# ``version`` and translate_text() reports successes/failures to it.
line_count_manager = LineCountManager()
  69. def init_db():
  70. """初始化数据库"""
  71. conn = sqlite3.connect('translation_progress.db')
  72. c = conn.cursor()
  73. # 检查是否需要迁移数据库
  74. try:
  75. c.execute("SELECT version FROM file_progress LIMIT 1")
  76. except sqlite3.OperationalError:
  77. # 如果表不存在或没有version字段,进行迁移
  78. print("正在更新数据库结构...")
  79. # 备份旧表
  80. c.execute("ALTER TABLE file_progress RENAME TO file_progress_old")
  81. c.execute("ALTER TABLE group_progress RENAME TO group_progress_old")
  82. # 创建新表
  83. c.execute('''
  84. CREATE TABLE IF NOT EXISTS file_progress (
  85. file_path TEXT PRIMARY KEY,
  86. total_lines INTEGER,
  87. processed_lines INTEGER,
  88. status TEXT,
  89. version TEXT,
  90. last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  91. )
  92. ''')
  93. c.execute('''
  94. CREATE TABLE IF NOT EXISTS group_progress (
  95. id INTEGER PRIMARY KEY AUTOINCREMENT,
  96. file_path TEXT,
  97. group_index INTEGER,
  98. original_text TEXT,
  99. translated_text TEXT,
  100. status TEXT,
  101. version TEXT,
  102. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  103. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  104. UNIQUE(file_path, group_index, version)
  105. )
  106. ''')
  107. # 迁移数据
  108. try:
  109. c.execute('''
  110. INSERT INTO file_progress
  111. (file_path, total_lines, processed_lines, status, version, last_updated)
  112. SELECT file_path, total_lines, processed_lines, status, ?, last_updated
  113. FROM file_progress_old
  114. ''', (line_count_manager.version,))
  115. c.execute('''
  116. INSERT INTO group_progress
  117. (file_path, group_index, original_text, translated_text, status, version, created_at, updated_at)
  118. SELECT file_path, group_index, original_text, translated_text, status, ?, created_at, updated_at
  119. FROM group_progress_old
  120. ''', (line_count_manager.version,))
  121. # 删除旧表
  122. c.execute("DROP TABLE file_progress_old")
  123. c.execute("DROP TABLE group_progress_old")
  124. print("数据库迁移完成")
  125. except sqlite3.OperationalError as e:
  126. print(f"迁移数据时出错: {str(e)}")
  127. # 如果迁移失败,回滚到原始表
  128. c.execute("DROP TABLE IF EXISTS file_progress")
  129. c.execute("DROP TABLE IF EXISTS group_progress")
  130. c.execute("ALTER TABLE file_progress_old RENAME TO file_progress")
  131. c.execute("ALTER TABLE group_progress_old RENAME TO group_progress")
  132. raise
  133. else:
  134. # 如果表已存在且包含version字段,创建新表
  135. c.execute('''
  136. CREATE TABLE IF NOT EXISTS file_progress (
  137. file_path TEXT PRIMARY KEY,
  138. total_lines INTEGER,
  139. processed_lines INTEGER,
  140. status TEXT,
  141. version TEXT,
  142. last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  143. )
  144. ''')
  145. c.execute('''
  146. CREATE TABLE IF NOT EXISTS group_progress (
  147. id INTEGER PRIMARY KEY AUTOINCREMENT,
  148. file_path TEXT,
  149. group_index INTEGER,
  150. original_text TEXT,
  151. translated_text TEXT,
  152. status TEXT,
  153. version TEXT,
  154. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  155. updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  156. UNIQUE(file_path, group_index, version)
  157. )
  158. ''')
  159. conn.commit()
  160. return conn
  161. def get_file_progress(conn, file_path):
  162. """获取文件翻译进度"""
  163. c = conn.cursor()
  164. c.execute('SELECT * FROM file_progress WHERE file_path = ?', (file_path,))
  165. return c.fetchone()
  166. def update_file_progress(conn, file_path, total_lines, processed_lines, status):
  167. """更新文件翻译进度"""
  168. c = conn.cursor()
  169. c.execute('''
  170. INSERT OR REPLACE INTO file_progress
  171. (file_path, total_lines, processed_lines, status, version, last_updated)
  172. VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  173. ''', (file_path, total_lines, processed_lines, status, line_count_manager.version))
  174. conn.commit()
  175. def get_group_progress(conn, file_path, group_index):
  176. """获取翻译组进度"""
  177. c = conn.cursor()
  178. c.execute('''
  179. SELECT * FROM group_progress
  180. WHERE file_path = ? AND group_index = ?
  181. ''', (file_path, group_index))
  182. return c.fetchone()
  183. def update_group_progress(conn, file_path, group_index, original_text, translated_text, status):
  184. """更新翻译组进度"""
  185. c = conn.cursor()
  186. c.execute('''
  187. INSERT OR REPLACE INTO group_progress
  188. (file_path, group_index, original_text, translated_text, status, version, updated_at)
  189. VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
  190. ''', (file_path, group_index, original_text, translated_text, status, line_count_manager.version))
  191. conn.commit()
  192. def get_completed_groups(conn, file_path):
  193. """获取已完成的翻译组"""
  194. c = conn.cursor()
  195. c.execute('''
  196. SELECT group_index, translated_text
  197. FROM group_progress
  198. WHERE file_path = ? AND status = 'completed' AND version = ?
  199. ORDER BY group_index
  200. ''', (file_path, line_count_manager.version))
  201. return c.fetchall()
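# Unused system-prompt fragment, kept for reference:
#   "- The output must be wrapped in a code block"
#   ", and comments in the corresponding language are provided only when necessary"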
  205. def translate_text(text, max_retries=3):
  206. """翻译文本,添加重试机制"""
  207. for attempt in range(max_retries):
  208. try:
  209. messages = [
  210. {
  211. "role": "system",
  212. "content": "- 你名为epub翻译大师,专注于将任意语言的文本翻译成中文。- 你在翻译过程中,力求保留原文语意,确保翻译的准确性和完整性。- 你特别注重翻译结果要贴合现代人的阅读习惯,使译文更加流畅易懂。- 在处理包含代码结构的文本时,你会特别注意保持代码的原样。- 你的服务旨在为用户提供高效、便捷的翻译体验,帮助用户跨越语言障碍。- 在回答问题的时候,尽可能保留原来的代码结构。"
  213. },
  214. {
  215. "role": "user",
  216. "content": text
  217. }
  218. ]
  219. response = client.chat.completions.create(
  220. model=model_name,
  221. messages=messages
  222. )
  223. # 翻译成功,调整行数
  224. line_count_manager.adjust_line_count(True)
  225. return response.choices[0].message.content
  226. except Exception as e:
  227. if attempt == max_retries - 1:
  228. print(f"翻译失败,已达到最大重试次数: {str(e)}")
  229. # 翻译失败,调整行数
  230. line_count_manager.adjust_line_count(False)
  231. return text
  232. print(f"翻译出错,正在重试 ({attempt + 1}/{max_retries}): {str(e)}")
  233. time.sleep(2 ** attempt) # 指数退避
  234. def process_html_file(file_path, conn):
  235. """处理HTML文件"""
  236. # 检查文件进度
  237. progress = get_file_progress(conn, file_path)
  238. try:
  239. # 尝试不同的编码方式读取文件
  240. encodings = ['utf-8', 'gbk', 'gb2312', 'latin1']
  241. content = None
  242. for encoding in encodings:
  243. try:
  244. with open(file_path, 'r', encoding=encoding) as f:
  245. content = f.read()
  246. break
  247. except UnicodeDecodeError:
  248. continue
  249. if content is None:
  250. raise Exception(f"无法使用支持的编码读取文件: {file_path}")
  251. # 使用正则表达式提取body标签内的内容
  252. body_pattern = re.compile(r'<body[^>]*>(.*?)</body>', re.DOTALL)
  253. body_match = body_pattern.search(content)
  254. if not body_match:
  255. print(f"警告: {file_path} 中没有找到body标签")
  256. return
  257. body_content = body_match.group(1)
  258. # 按行分割内容,保留所有HTML标签行,但只翻译包含 <p class 的行
  259. lines = []
  260. for line in body_content.split('\n'):
  261. line = line.strip()
  262. if line and line.startswith('<'):
  263. lines.append(line)
  264. total_lines = len(lines)
  265. # 获取已完成的翻译组
  266. completed_groups = get_completed_groups(conn, file_path)
  267. completed_indices = {group[0] for group in completed_groups}
  268. # 计算已处理的进度
  269. if progress:
  270. print(f"文件 {file_path} 已处理进度: {progress[2]}/{progress[1]} 行 ({round(progress[2]*100/progress[1], 2)}%)")
  271. # 按组处理内容
  272. translated_lines = []
  273. try:
  274. for i in tqdm(range(0, len(lines), line_count_manager.current_line_count), desc=f"处理文件 {os.path.basename(file_path)}", unit="组"):
  275. group_index = i // line_count_manager.current_line_count
  276. # 检查是否已完成
  277. if group_index in completed_indices:
  278. # 使用已完成的翻译
  279. for group in completed_groups:
  280. if group[0] == group_index:
  281. translated_lines.extend(group[1].split('\n'))
  282. break
  283. continue
  284. group = lines[i:i+line_count_manager.current_line_count]
  285. if group:
  286. # 保存原始文本
  287. original_text = "\n".join(group)
  288. # 收集需要翻译的段落
  289. paragraphs_to_translate = []
  290. paragraph_indices = []
  291. for idx, line in enumerate(group):
  292. if '<p class' in line:
  293. paragraphs_to_translate.append(line)
  294. paragraph_indices.append(idx)
  295. # 如果有需要翻译的段落,进行翻译
  296. if paragraphs_to_translate:
  297. translated_paragraphs = []
  298. for paragraph in paragraphs_to_translate:
  299. translated_paragraph = translate_text(paragraph)
  300. translated_paragraphs.append(translated_paragraph)
  301. # 将翻译后的段落放回原位置
  302. translated_group = group.copy()
  303. for idx, translated in zip(paragraph_indices, translated_paragraphs):
  304. translated_group[idx] = translated
  305. else:
  306. translated_group = group
  307. translated_text = "\n".join(translated_group)
  308. # 更新翻译组进度
  309. update_group_progress(conn, file_path, group_index, original_text, translated_text, 'completed')
  310. # 分割翻译后的文本
  311. translated_lines.extend(translated_group)
  312. # 更新文件进度
  313. processed_lines = min((group_index + 1) * line_count_manager.current_line_count, total_lines)
  314. update_file_progress(conn, file_path, total_lines, processed_lines, 'in_progress')
  315. # 添加延迟以避免API限制
  316. time.sleep(0.5) # 添加适当的延迟
  317. # 替换原始内容
  318. if translated_lines:
  319. # 保持原始内容的顺序和结构
  320. new_body_content = body_content
  321. current_index = 0
  322. # 遍历原始内容,替换需要翻译的部分
  323. for line in body_content.split('\n'):
  324. line = line.strip()
  325. if line and line.startswith('<'):
  326. if '<p class' in line and current_index < len(translated_lines):
  327. # 替换翻译后的内容
  328. new_body_content = new_body_content.replace(line, translated_lines[current_index])
  329. current_index += 1
  330. else:
  331. # 保持原样
  332. continue
  333. new_content = content.replace(body_content, new_body_content)
  334. # 保存修改后的文件
  335. with open(file_path, 'w', encoding='utf-8') as f:
  336. f.write(new_content)
  337. # 更新完成状态
  338. update_file_progress(conn, file_path, total_lines, total_lines, 'completed')
  339. print(f"文件 {file_path} 翻译完成")
  340. except KeyboardInterrupt:
  341. print("\n检测到中断,保存当前进度...")
  342. if 'processed_lines' in locals():
  343. update_file_progress(conn, file_path, total_lines, processed_lines, 'interrupted')
  344. raise
  345. except Exception as e:
  346. print(f"处理文件时出错: {str(e)}")
  347. if 'processed_lines' in locals():
  348. update_file_progress(conn, file_path, total_lines, processed_lines, 'error')
  349. raise
  350. except Exception as e:
  351. print(f"读取文件时出错: {str(e)}")
  352. return
  353. def main():
  354. ops_dir = "002/Ops"
  355. html_files = [f for f in os.listdir(ops_dir) if f.endswith('.html')]
  356. print(f"找到 {len(html_files)} 个HTML文件需要处理")
  357. # 初始化数据库连接
  358. conn = init_db()
  359. try:
  360. for filename in tqdm(html_files, desc="处理文件", unit="文件"):
  361. file_path = os.path.join(ops_dir, filename)
  362. process_html_file(file_path, conn)
  363. except KeyboardInterrupt:
  364. print("\n程序被用户中断")
  365. finally:
  366. conn.close()
# Start the translation run only when this file is executed as a script.
if __name__ == "__main__":
    main()