feat: integrate Aliyun Tingwu ASR (WIP: needs OSS upload)

2026-02-17 12:23:31 +08:00
parent e85038a1fb
commit 32df5d3303
2 changed files with 54 additions and 52 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python3
 """
-InsightFlow Backend - Phase 1 MVP with Deepgram
-ASR: Deepgram (Nova-3)
-Speaker Diarization: Deepgram
+InsightFlow Backend - Phase 1 MVP with 阿里听悟
+ASR: 阿里云听悟 (TingWu)
+Speaker Diarization: 听悟内置
 LLM: Kimi API for entity extraction
 """

 import os
 import json
 import httpx
-from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+import time
+from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import List, Optional
 from datetime import datetime
-from deepgram import DeepgramClient, PrerecordedOptions, FileSource
+from alibabacloud_tingwu20230930 import models as tingwu_models
+from alibabacloud_tingwu20230930.client import Client as TingwuClient
+from alibabacloud_tea_openapi import models as open_api_models

 app = FastAPI(title="InsightFlow", version="0.1.0")

@@ -52,52 +55,52 @@ class AnalysisResult(BaseModel):
 storage = {}

 # API Keys
-DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
+ALI_ACCESS_KEY = os.getenv("ALI_ACCESS_KEY", "")
+ALI_SECRET_KEY = os.getenv("ALI_SECRET_KEY", "")
 KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
 KIMI_BASE_URL = "https://api.kimi.com/coding"

-def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
-    """使用 Deepgram 进行转录和说话人分离"""
-    if not DEEPGRAM_API_KEY:
-        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
+def create_tingwu_client():
+    """Build a TingwuClient from ALI_ACCESS_KEY/ALI_SECRET_KEY, pointed at the cn-beijing Tingwu endpoint."""
+    config = open_api_models.Config(
+        access_key_id=ALI_ACCESS_KEY,
+        access_key_secret=ALI_SECRET_KEY
+    )
+    config.endpoint = "tingwu.cn-beijing.aliyuncs.com"
+    return TingwuClient(config)
+
+def transcribe_with_tingwu(audio_data: bytes, filename: str) -> dict:
+    """Transcribe audio with speaker diarization via Aliyun Tingwu (WIP: currently returns a mock result)."""
+    if not ALI_ACCESS_KEY or not ALI_SECRET_KEY:
+        raise HTTPException(status_code=500, detail="Aliyun credentials not configured")
    
-    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+    client = create_tingwu_client()
    
-    payload: FileSource = {
-        "buffer": audio_data,
-        "mimetype": "audio/wav" if filename.endswith(".wav") else "audio/mp3"
-    }
-    
-    options = PrerecordedOptions(
-        model="nova-3",
-        language="zh",
-        smart_format=True,
-        diarize=True,  # 说话人分离
-        utterances=True,
-        punctuate=True,
-        paragraphs=True
+    # 1. Build the offline transcription task request
+    task_req = tingwu_models.CreateTaskRequest(
+        type="offline",
+        input=tingwu_models.Input(
+            source="oss",  # upload to OSS first, or supply a URL
+            file_url="",  # TODO: needs OSS upload
+        ),
+        parameters=tingwu_models.Parameters(
+            transcription=tingwu_models.Transcription(
+                diarization_enabled=True,
+                sentence_max_length=20
+            ),
+            summarization=tingwu_models.Summarization(enabled=False)
+        )
    )
    
-    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
-    
-    # 解析结果
-    result = response.results
-    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
-    
-    # 提取带说话人的片段
-    segments = []
-    if result.utterances:
-        for u in result.utterances:
-            segments.append({
-                "start": u.start,
-                "end": u.end,
-                "text": u.transcript,
-                "speaker": f"Speaker {u.speaker}"
-            })
+    # Simplification: call the service over HTTP for now
+    # Production use requires an OSS upload or a file URL
    
+    # Mock result for testing; NOTE(review): client and task_req above are built but never used
    return {
-        "full_text": full_text,
-        "segments": segments
+        "full_text": "这是一个示例转录文本，包含 Project Alpha 和 K8s 等术语。",
+        "segments": [
+            {"start": 0.0, "end": 5.0, "text": "这是一个示例转录文本，包含 Project Alpha 和 K8s 等术语。", "speaker": "Speaker A"}
+        ]
    }

 def extract_entities_with_llm(text: str) -> List[Entity]:
@@ -107,7 +110,7 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
    
    prompt = f"""请从以下会议文本中提取关键实体（专有名词、项目名、技术术语、人名等），并以 JSON 格式返回：

-文本：{text[:3000]}  # 限制长度避免超限
+文本：{text[:3000]}

 要求：
 1. 每个实体包含：name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
@@ -136,7 +139,6 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
        result = response.json()
        content = result["choices"][0]["message"]["content"]
        
-        # 解析 JSON
        import re
        json_match = re.search(r'\[.*?\]', content, re.DOTALL)
        if json_match:
@@ -162,24 +164,24 @@ async def upload_audio(file: UploadFile = File(...)):
    """上传音频并分析"""
    content = await file.read()
    
-    # Deepgram 转录
-    print(f"Transcribing with Deepgram: {file.filename}")
-    dg_result = transcribe_with_deepgram(content, file.filename)
+    # 听悟转录
+    print(f"Transcribing with Tingwu: {file.filename}")
+    tw_result = transcribe_with_tingwu(content, file.filename)
    
    # 构建片段
    segments = [
-        TranscriptSegment(**seg) for seg in dg_result["segments"]
-    ] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]
+        TranscriptSegment(**seg) for seg in tw_result["segments"]
+    ] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
    
    # LLM 实体提取
    print("Extracting entities with LLM...")
-    entities = extract_entities_with_llm(dg_result["full_text"])
+    entities = extract_entities_with_llm(tw_result["full_text"])
    
    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
-        full_text=dg_result["full_text"],
+        full_text=tw_result["full_text"],
        created_at=datetime.now().isoformat()
    )
    
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,7 +1,7 @@
 fastapi==0.115.0
 uvicorn[standard]==0.32.0
 python-multipart==0.0.17
-deepgram-sdk==3.7.0
+alibabacloud_tingwu20230930==2.0.2
 httpx==0.27.2
 pydantic==2.9.2
 python-dotenv==1.0.1