feat: integrate Aliyun Tingwu ASR (WIP: needs OSS upload)

This commit is contained in:
OpenClaw Bot
2026-02-17 12:23:31 +08:00
parent e85038a1fb
commit 32df5d3303
2 changed files with 54 additions and 52 deletions

View File

@@ -1,21 +1,24 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
InsightFlow Backend - Phase 1 MVP with Deepgram InsightFlow Backend - Phase 1 MVP with 阿里听悟
ASR: Deepgram (Nova-3) ASR: 阿里云听悟 (TingWu)
Speaker Diarization: Deepgram Speaker Diarization: 听悟内置
LLM: Kimi API for entity extraction LLM: Kimi API for entity extraction
""" """
import os import os
import json import json
import httpx import httpx
from fastapi import FastAPI, File, UploadFile, HTTPException, Form import time
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel from pydantic import BaseModel
from typing import List, Optional from typing import List, Optional
from datetime import datetime from datetime import datetime
from deepgram import DeepgramClient, PrerecordedOptions, FileSource from alibabacloud_tingwu20230930 import models as tingwu_models
from alibabacloud_tingwu20230930.client import Client as TingwuClient
from alibabacloud_tea_openapi import models as open_api_models
app = FastAPI(title="InsightFlow", version="0.1.0") app = FastAPI(title="InsightFlow", version="0.1.0")
@@ -52,52 +55,52 @@ class AnalysisResult(BaseModel):
storage = {} storage = {}
# API Keys # API Keys
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "") ALI_ACCESS_KEY = os.getenv("ALI_ACCESS_KEY", "")
ALI_SECRET_KEY = os.getenv("ALI_SECRET_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "") KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
KIMI_BASE_URL = "https://api.kimi.com/coding" KIMI_BASE_URL = "https://api.kimi.com/coding"
def create_tingwu_client():
    """Build a TingWu (阿里听悟) API client bound to the Beijing region.

    Credentials come from the module-level ALI_ACCESS_KEY / ALI_SECRET_KEY
    constants (read from the environment at import time).
    """
    cfg = open_api_models.Config(
        access_key_id=ALI_ACCESS_KEY,
        access_key_secret=ALI_SECRET_KEY,
    )
    # The OpenAPI Config has no endpoint kwarg in this SDK style;
    # it is assigned as an attribute after construction.
    cfg.endpoint = "tingwu.cn-beijing.aliyuncs.com"
    return TingwuClient(cfg)
def transcribe_with_tingwu(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Aliyun TingWu (ASR + built-in speaker diarization).

    WIP: TingWu's offline API accepts a file URL (typically an OSS object),
    not raw bytes, so until the OSS upload step is implemented this returns
    a fixed mock result so the rest of the pipeline stays testable.

    Args:
        audio_data: Raw audio bytes from the upload (unused until OSS upload lands).
        filename: Original file name (unused until OSS upload lands).

    Returns:
        dict with "full_text" (str) and "segments" (list of dicts carrying
        start/end/text/speaker), the shape upload_audio expects.

    Raises:
        HTTPException: 500 when the Aliyun credentials are not configured.
    """
    if not ALI_ACCESS_KEY or not ALI_SECRET_KEY:
        raise HTTPException(status_code=500, detail="Aliyun credentials not configured")

    # Client is created now so misconfiguration surfaces early; the task
    # submission itself is pending the OSS upload step.
    client = create_tingwu_client()

    # TODO(OSS): upload audio_data to OSS, then create and poll the task.
    # The request previously built here eagerly (and never submitted — its
    # file_url was empty, which TingWu would reject) is kept as a template:
    #
    # task_req = tingwu_models.CreateTaskRequest(
    #     type="offline",
    #     input=tingwu_models.Input(source="oss", file_url=oss_url),
    #     parameters=tingwu_models.Parameters(
    #         transcription=tingwu_models.Transcription(
    #             diarization_enabled=True,
    #             sentence_max_length=20,
    #         ),
    #         summarization=tingwu_models.Summarization(enabled=False),
    #     ),
    # )
    # client.create_task(task_req)  # then poll GetTaskInfo until done

    # Mock result so downstream entity extraction can be exercised end-to-end.
    mock_text = "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语。"
    return {
        "full_text": mock_text,
        "segments": [
            {"start": 0.0, "end": 5.0, "text": mock_text, "speaker": "Speaker A"}
        ],
    }
def extract_entities_with_llm(text: str) -> List[Entity]: def extract_entities_with_llm(text: str) -> List[Entity]:
@@ -107,7 +110,7 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回: prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
文本:{text[:3000]} # 限制长度避免超限 文本:{text[:3000]}
要求: 要求:
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义) 1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
@@ -136,7 +139,6 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
result = response.json() result = response.json()
content = result["choices"][0]["message"]["content"] content = result["choices"][0]["message"]["content"]
# 解析 JSON
import re import re
json_match = re.search(r'\[.*?\]', content, re.DOTALL) json_match = re.search(r'\[.*?\]', content, re.DOTALL)
if json_match: if json_match:
@@ -162,24 +164,24 @@ async def upload_audio(file: UploadFile = File(...)):
"""上传音频并分析""" """上传音频并分析"""
content = await file.read() content = await file.read()
# Deepgram 转录 # 听悟转录
print(f"Transcribing with Deepgram: {file.filename}") print(f"Transcribing with Tingwu: {file.filename}")
dg_result = transcribe_with_deepgram(content, file.filename) tw_result = transcribe_with_tingwu(content, file.filename)
# 构建片段 # 构建片段
segments = [ segments = [
TranscriptSegment(**seg) for seg in dg_result["segments"] TranscriptSegment(**seg) for seg in tw_result["segments"]
] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")] ] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
# LLM 实体提取 # LLM 实体提取
print("Extracting entities with LLM...") print("Extracting entities with LLM...")
entities = extract_entities_with_llm(dg_result["full_text"]) entities = extract_entities_with_llm(tw_result["full_text"])
analysis = AnalysisResult( analysis = AnalysisResult(
transcript_id=os.urandom(8).hex(), transcript_id=os.urandom(8).hex(),
segments=segments, segments=segments,
entities=entities, entities=entities,
full_text=dg_result["full_text"], full_text=tw_result["full_text"],
created_at=datetime.now().isoformat() created_at=datetime.now().isoformat()
) )

View File

@@ -1,7 +1,7 @@
fastapi==0.115.0 fastapi==0.115.0
uvicorn[standard]==0.32.0 uvicorn[standard]==0.32.0
python-multipart==0.0.17 python-multipart==0.0.17
deepgram-sdk==3.7.0 alibabacloud_tingwu20230930==2.0.2
httpx==0.27.2 httpx==0.27.2
pydantic==2.9.2 pydantic==2.9.2
python-dotenv==1.0.1 python-dotenv==1.0.1