feat: integrate Aliyun Tingwu ASR (WIP: needs OSS upload)
This commit is contained in:
104
backend/main.py
104
backend/main.py
@@ -1,21 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||
ASR: Deepgram (Nova-3)
|
||||
Speaker Diarization: Deepgram
|
||||
InsightFlow Backend - Phase 1 MVP with 阿里听悟
|
||||
ASR: 阿里云听悟 (TingWu)
|
||||
Speaker Diarization: 听悟内置
|
||||
LLM: Kimi API for entity extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
import time
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||
from alibabacloud_tingwu20230930 import models as tingwu_models
|
||||
from alibabacloud_tingwu20230930.client import Client as TingwuClient
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
|
||||
app = FastAPI(title="InsightFlow", version="0.1.0")
|
||||
|
||||
@@ -52,52 +55,52 @@ class AnalysisResult(BaseModel):
|
||||
storage = {}
|
||||
|
||||
# API Keys
|
||||
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
|
||||
ALI_ACCESS_KEY = os.getenv("ALI_ACCESS_KEY", "")
|
||||
ALI_SECRET_KEY = os.getenv("ALI_SECRET_KEY", "")
|
||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
|
||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
|
||||
"""使用 Deepgram 进行转录和说话人分离"""
|
||||
if not DEEPGRAM_API_KEY:
|
||||
raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
|
||||
def create_tingwu_client():
|
||||
"""创建听悟客户端"""
|
||||
config = open_api_models.Config(
|
||||
access_key_id=ALI_ACCESS_KEY,
|
||||
access_key_secret=ALI_SECRET_KEY
|
||||
)
|
||||
config.endpoint = "tingwu.cn-beijing.aliyuncs.com"
|
||||
return TingwuClient(config)
|
||||
|
||||
def transcribe_with_tingwu(audio_data: bytes, filename: str) -> dict:
|
||||
"""使用阿里听悟进行转录和说话人分离"""
|
||||
if not ALI_ACCESS_KEY or not ALI_SECRET_KEY:
|
||||
raise HTTPException(status_code=500, detail="Aliyun credentials not configured")
|
||||
|
||||
deepgram = DeepgramClient(DEEPGRAM_API_KEY)
|
||||
client = create_tingwu_client()
|
||||
|
||||
payload: FileSource = {
|
||||
"buffer": audio_data,
|
||||
"mimetype": "audio/wav" if filename.endswith(".wav") else "audio/mp3"
|
||||
}
|
||||
|
||||
options = PrerecordedOptions(
|
||||
model="nova-3",
|
||||
language="zh",
|
||||
smart_format=True,
|
||||
diarize=True, # 说话人分离
|
||||
utterances=True,
|
||||
punctuate=True,
|
||||
paragraphs=True
|
||||
# 1. 创建任务
|
||||
task_req = tingwu_models.CreateTaskRequest(
|
||||
type="offline",
|
||||
input=tingwu_models.Input(
|
||||
source="oss", # 先上传到 OSS 或使用 URL
|
||||
file_url="", # TODO: 需要 OSS 上传
|
||||
),
|
||||
parameters=tingwu_models.Parameters(
|
||||
transcription=tingwu_models.Transcription(
|
||||
diarization_enabled=True,
|
||||
sentence_max_length=20
|
||||
),
|
||||
summarization=tingwu_models.Summarization(enabled=False)
|
||||
)
|
||||
)
|
||||
|
||||
response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
|
||||
|
||||
# 解析结果
|
||||
result = response.results
|
||||
full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
|
||||
|
||||
# 提取带说话人的片段
|
||||
segments = []
|
||||
if result.utterances:
|
||||
for u in result.utterances:
|
||||
segments.append({
|
||||
"start": u.start,
|
||||
"end": u.end,
|
||||
"text": u.transcript,
|
||||
"speaker": f"Speaker {u.speaker}"
|
||||
})
|
||||
# 简化:先用 HTTP 方式调用
|
||||
# 实际生产需要 OSS 上传或 URL
|
||||
|
||||
# Mock 结果用于测试
|
||||
return {
|
||||
"full_text": full_text,
|
||||
"segments": segments
|
||||
"full_text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语。",
|
||||
"segments": [
|
||||
{"start": 0.0, "end": 5.0, "text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语。", "speaker": "Speaker A"}
|
||||
]
|
||||
}
|
||||
|
||||
def extract_entities_with_llm(text: str) -> List[Entity]:
|
||||
@@ -107,7 +110,7 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
|
||||
|
||||
prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
|
||||
|
||||
文本:{text[:3000]} # 限制长度避免超限
|
||||
文本:{text[:3000]}
|
||||
|
||||
要求:
|
||||
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
|
||||
@@ -136,7 +139,6 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
|
||||
result = response.json()
|
||||
content = result["choices"][0]["message"]["content"]
|
||||
|
||||
# 解析 JSON
|
||||
import re
|
||||
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
|
||||
if json_match:
|
||||
@@ -162,24 +164,24 @@ async def upload_audio(file: UploadFile = File(...)):
|
||||
"""上传音频并分析"""
|
||||
content = await file.read()
|
||||
|
||||
# Deepgram 转录
|
||||
print(f"Transcribing with Deepgram: {file.filename}")
|
||||
dg_result = transcribe_with_deepgram(content, file.filename)
|
||||
# 听悟转录
|
||||
print(f"Transcribing with Tingwu: {file.filename}")
|
||||
tw_result = transcribe_with_tingwu(content, file.filename)
|
||||
|
||||
# 构建片段
|
||||
segments = [
|
||||
TranscriptSegment(**seg) for seg in dg_result["segments"]
|
||||
] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]
|
||||
TranscriptSegment(**seg) for seg in tw_result["segments"]
|
||||
] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
|
||||
|
||||
# LLM 实体提取
|
||||
print("Extracting entities with LLM...")
|
||||
entities = extract_entities_with_llm(dg_result["full_text"])
|
||||
entities = extract_entities_with_llm(tw_result["full_text"])
|
||||
|
||||
analysis = AnalysisResult(
|
||||
transcript_id=os.urandom(8).hex(),
|
||||
segments=segments,
|
||||
entities=entities,
|
||||
full_text=dg_result["full_text"],
|
||||
full_text=tw_result["full_text"],
|
||||
created_at=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.32.0
|
||||
python-multipart==0.0.17
|
||||
deepgram-sdk==3.7.0
|
||||
alibabacloud_tingwu20230930==2.0.2
|
||||
httpx==0.27.2
|
||||
pydantic==2.9.2
|
||||
python-dotenv==1.0.1
|
||||
|
||||
Reference in New Issue
Block a user