feat: integrate Aliyun Tingwu ASR (WIP: needs OSS upload)

2026-02-17 12:23:31 +08:00
parent e85038a1fb
commit 32df5d3303
2 changed files with 54 additions and 52 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python3
 """
-InsightFlow Backend - Phase 1 MVP with Deepgram
+InsightFlow Backend - Phase 1 MVP with 阿里听悟
-ASR: Deepgram (Nova-3)
+ASR: 阿里云听悟 (TingWu)
-Speaker Diarization: Deepgram
+Speaker Diarization: 听悟内置
 LLM: Kimi API for entity extraction
 """
 import os
 import json
 import httpx
-from fastapi import FastAPI, File, UploadFile, HTTPException, Form
+import time
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import List, Optional
 from datetime import datetime
-from deepgram import DeepgramClient, PrerecordedOptions, FileSource
+from alibabacloud_tingwu20230930 import models as tingwu_models
 from alibabacloud_tingwu20230930.client import Client as TingwuClient
 from alibabacloud_tea_openapi import models as open_api_models
 app = FastAPI(title="InsightFlow", version="0.1.0")
@@ -52,52 +55,52 @@ class AnalysisResult(BaseModel):
 storage = {}
 # API Keys
-DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
+ALI_ACCESS_KEY = os.getenv("ALI_ACCESS_KEY", "")
 ALI_SECRET_KEY = os.getenv("ALI_SECRET_KEY", "")
 KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
 KIMI_BASE_URL = "https://api.kimi.com/coding"
-def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
+def create_tingwu_client():
-    """使用 Deepgram 进行转录和说话人分离"""
+    """创建听悟客户端"""
-    if not DEEPGRAM_API_KEY:
+    config = open_api_models.Config(
-        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
+        access_key_id=ALI_ACCESS_KEY,
        access_key_secret=ALI_SECRET_KEY
    )
    config.endpoint = "tingwu.cn-beijing.aliyuncs.com"
    return TingwuClient(config)
 def transcribe_with_tingwu(audio_data: bytes, filename: str) -> dict:
    """使用阿里听悟进行转录和说话人分离"""
    if not ALI_ACCESS_KEY or not ALI_SECRET_KEY:
        raise HTTPException(status_code=500, detail="Aliyun credentials not configured")
-    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
+    client = create_tingwu_client()
-    payload: FileSource = {
+    # 1. 创建任务
-        "buffer": audio_data,
+    task_req = tingwu_models.CreateTaskRequest(
-        "mimetype": "audio/wav" if filename.endswith(".wav") else "audio/mp3"
+        type="offline",
-    }
+        input=tingwu_models.Input(
-    
+            source="oss",  # 先上传到 OSS 或使用 URL
-    options = PrerecordedOptions(
+            file_url="",  # TODO: 需要 OSS 上传
-        model="nova-3",
+        ),
-        language="zh",
+        parameters=tingwu_models.Parameters(
-        smart_format=True,
+            transcription=tingwu_models.Transcription(
-        diarize=True,  # 说话人分离
+                diarization_enabled=True,
-        utterances=True,
+                sentence_max_length=20
-        punctuate=True,
+            ),
-        paragraphs=True
+            summarization=tingwu_models.Summarization(enabled=False)
        )
    )
-    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
+    # 简化：先用 HTTP 方式调用
-    
+    # 实际生产需要 OSS 上传或 URL
    # 解析结果
    result = response.results
    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
    # 提取带说话人的片段
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}"
            })
    # Mock 结果用于测试
    return {
-        "full_text": full_text,
+        "full_text": "这是一个示例转录文本，包含 Project Alpha 和 K8s 等术语。",
-        "segments": segments
+        "segments": [
            {"start": 0.0, "end": 5.0, "text": "这是一个示例转录文本，包含 Project Alpha 和 K8s 等术语。", "speaker": "Speaker A"}
        ]
    }
 def extract_entities_with_llm(text: str) -> List[Entity]:
@@ -107,7 +110,7 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
    prompt = f"""请从以下会议文本中提取关键实体（专有名词、项目名、技术术语、人名等），并以 JSON 格式返回：
-文本：{text[:3000]}  # 限制长度避免超限
+文本：{text[:3000]}
 要求：
 1. 每个实体包含：name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
@@ -136,7 +139,6 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
        result = response.json()
        content = result["choices"][0]["message"]["content"]
        # 解析 JSON
        import re
        json_match = re.search(r'\[.*?\]', content, re.DOTALL)
        if json_match:
@@ -162,24 +164,24 @@ async def upload_audio(file: UploadFile = File(...)):
    """上传音频并分析"""
    content = await file.read()
-    # Deepgram 转录
+    # 听悟转录
-    print(f"Transcribing with Deepgram: {file.filename}")
+    print(f"Transcribing with Tingwu: {file.filename}")
-    dg_result = transcribe_with_deepgram(content, file.filename)
+    tw_result = transcribe_with_tingwu(content, file.filename)
    # 构建片段
    segments = [
-        TranscriptSegment(**seg) for seg in dg_result["segments"]
+        TranscriptSegment(**seg) for seg in tw_result["segments"]
-    ] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]
+    ] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
    # LLM 实体提取
    print("Extracting entities with LLM...")
-    entities = extract_entities_with_llm(dg_result["full_text"])
+    entities = extract_entities_with_llm(tw_result["full_text"])
    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
-        full_text=dg_result["full_text"],
+        full_text=tw_result["full_text"],
        created_at=datetime.now().isoformat()
    )
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,7 +1,7 @@
 fastapi==0.115.0
 uvicorn[standard]==0.32.0
 python-multipart==0.0.17
-deepgram-sdk==3.7.0
+alibabacloud_tingwu20230930==2.0.2
 httpx==0.27.2
 pydantic==2.9.2
 python-dotenv==1.0.1