瀏覽代碼

feat: 添加翻译文件清理功能并优化错误处理 (#1219)

- 新增cleanup_deleted_source_files方法用于清理源文件已删除的翻译文件
- 为_remove_thinking和_translate_chunk方法添加Optional类型支持
- 改进错误日志输出格式
- 添加命令行参数支持清理操作和预览模式

Signed-off-by: longjin <longjin@DragonOS.org>
LoGin 18 小時之前
父節點
當前提交
2532cb74fb
共有 1 個文件被更改,包括 81 次插入9 次删除
  1. 81 9
      tools/doc_translator.py

+ 81 - 9
tools/doc_translator.py

@@ -24,7 +24,7 @@ import json
 from pathlib import Path
 import sys
 import threading
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Optional
 import openai
 import datetime
 import time
@@ -241,18 +241,20 @@ class DocumentTranslator:
 
         return text
 
-    def _remove_thinking(self, text: str) -> str:
+    def _remove_thinking(self, text: Optional[str]) -> str:
         """Remove <think> tags from text"""
+        if text is None:
+            return ""
         return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
 
-    def _translate_chunk(self, args: Tuple[str, str]) -> str:
+    def _translate_chunk(self, args: Tuple[str, str]) -> Optional[str]:
         """翻译单个文本块(内部方法,用于并行处理)"""
         chunk, lang = args
         retry = 3
         while retry > 0:
             try:
                 lang_name = CONFIG["target_languages"].get(lang, "English")
-                prompt = f"你是一个专业的文档翻译助手,请将以下中文技术文档准确翻译成{lang_name},保持技术术语的正确性和格式不变"
+                prompt = f"你是一个专业的文档翻译助手,请将以下中文技术文档准确翻译成{lang_name},保持技术术语的正确性和格式不变."
 
                 # disable qwen3's thinking mode
                 if "qwen3" in CONFIG["model"].lower():
@@ -273,11 +275,12 @@ class DocumentTranslator:
             except Exception as e:
                 retry -= 1
                 if retry == 0:
-                    print("翻译失败: {e},放弃重试。")
+                    print(f"翻译失败: {e},放弃重试。")
                     return None
 
                 print(f"翻译出错: {e}, retrying... ({retry})")
                 time.sleep(2)
+        return None
 
     def translate_text(self, text: str, lang: str) -> str:
         """使用openai接口翻译文本
@@ -396,9 +399,65 @@ class DocumentTranslator:
 
         print(f"标题已添加到 {lang_root_doc_path}")
 
-    def run(self):
-        """运行翻译流程"""
-
+    def cleanup_deleted_source_files(self, dry_run=False):
+        """清理源文件已删除的翻译文件
+        Args:
+            dry_run: 如果为True,仅预览而不实际执行删除操作
+        """
+        print(f"Checking for deleted source files (dry_run={dry_run})...")
+        cleaned_count = 0
+        log_file = os.path.join(CONFIG["source_dir"], "cleanup.log")
+        log_entries = []
+        
+        for lang in CONFIG["target_languages"]:
+            lang_dir = os.path.join(CONFIG["source_dir"], "locales", lang)
+            if not os.path.exists(lang_dir):
+                continue
+                
+            for root, _, files in os.walk(lang_dir):
+                for file in files:
+                    if not file.endswith((".rst", ".md")):
+                        continue
+                        
+                    # 获取翻译文件相对路径
+                    trans_rel_path = os.path.relpath(os.path.join(root, file), lang_dir)
+                    # 对应的源文件路径
+                    source_path = os.path.join(CONFIG["source_dir"], trans_rel_path)
+                    
+                    if not os.path.exists(source_path):
+                        # 源文件不存在,处理翻译文件
+                        trans_file = os.path.join(root, file)
+                        log_entry = f"[{'DRY RUN' if dry_run else 'DELETED'}] {trans_rel_path}"
+                        log_entries.append(log_entry)
+                        print(log_entry)
+                        
+                        if not dry_run:
+                            os.remove(trans_file)
+                            # 从缓存中移除
+                            cache_key = self._get_cache_key(trans_rel_path, lang)
+                            with self._cache_lock:
+                                if cache_key in self._cache:
+                                    del self._cache[cache_key]
+                                    self._save_cache()
+                        cleaned_count += 1
+        
+        if cleaned_count > 0:
+            if not dry_run:
+                self._save_cache()
+                # 写入日志文件
+                with open(log_file, "a", encoding="utf-8") as f:
+                    f.write("\n".join(log_entries) + "\n")
+            print(f"Found {cleaned_count} orphaned translation files {'(dry run)' if dry_run else ''}")
+        else:
+            print("No orphaned translation files found")
+
+    def run(self, dry_run=False):
+        """运行翻译流程
+        Args:
+            dry_run: 如果为True,仅预览清理操作而不实际执行
+        """
+        self.cleanup_deleted_source_files(dry_run=dry_run)
+        
         print("Collecting all files...")
         all_files = []
         for root, dirs, files in os.walk(CONFIG["source_dir"], topdown=True):
@@ -459,6 +518,15 @@ class DocumentTranslator:
 
 
 if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='DragonOS文档翻译工具')
+    parser.add_argument('--cleanup-only', action='store_true', 
+                       help='仅执行清理操作,不进行翻译')
+    parser.add_argument('--dry-run', action='store_true',
+                       help='预览清理操作而不实际执行')
+    args = parser.parse_args()
+
     print("Starting translation process...")
     print("WORKERS: ", CONFIG["max_workers"])
     print("LANGUAGES: ", CONFIG["target_languages"])
@@ -466,7 +534,11 @@ if __name__ == "__main__":
     print("MODEL: ", CONFIG["model"])
 
     translator = DocumentTranslator()
-    translator.run()
+    
+    if args.cleanup_only:
+        translator.cleanup_deleted_source_files(dry_run=args.dry_run)
+    else:
+        translator.run(dry_run=args.dry_run)
     
     if translator.fail_count > 0:
         sys.exit(1)