feat: integrate Aliyun Tingwu ASR (WIP: needs OSS upload)
This commit is contained in:
104
backend/main.py
104
backend/main.py
@@ -1,21 +1,24 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
InsightFlow Backend - Phase 1 MVP with 阿里听悟
|
||||||
ASR: Deepgram (Nova-3)
|
ASR: 阿里云听悟 (TingWu)
|
||||||
Speaker Diarization: Deepgram
|
Speaker Diarization: 听悟内置
|
||||||
LLM: Kimi API for entity extraction
|
LLM: Kimi API for entity extraction
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import httpx
|
import httpx
|
||||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
import time
|
||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
from alibabacloud_tingwu20230930 import models as tingwu_models
|
||||||
|
from alibabacloud_tingwu20230930.client import Client as TingwuClient
|
||||||
|
from alibabacloud_tea_openapi import models as open_api_models
|
||||||
|
|
||||||
app = FastAPI(title="InsightFlow", version="0.1.0")
|
app = FastAPI(title="InsightFlow", version="0.1.0")
|
||||||
|
|
||||||
@@ -52,52 +55,52 @@ class AnalysisResult(BaseModel):
|
|||||||
storage = {}
|
storage = {}
|
||||||
|
|
||||||
# API Keys
|
# API Keys
|
||||||
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
|
ALI_ACCESS_KEY = os.getenv("ALI_ACCESS_KEY", "")
|
||||||
|
ALI_SECRET_KEY = os.getenv("ALI_SECRET_KEY", "")
|
||||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||||
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||||
|
|
||||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
|
def create_tingwu_client():
|
||||||
"""使用 Deepgram 进行转录和说话人分离"""
|
"""创建听悟客户端"""
|
||||||
if not DEEPGRAM_API_KEY:
|
config = open_api_models.Config(
|
||||||
raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
|
access_key_id=ALI_ACCESS_KEY,
|
||||||
|
access_key_secret=ALI_SECRET_KEY
|
||||||
|
)
|
||||||
|
config.endpoint = "tingwu.cn-beijing.aliyuncs.com"
|
||||||
|
return TingwuClient(config)
|
||||||
|
|
||||||
|
def transcribe_with_tingwu(audio_data: bytes, filename: str) -> dict:
|
||||||
|
"""使用阿里听悟进行转录和说话人分离"""
|
||||||
|
if not ALI_ACCESS_KEY or not ALI_SECRET_KEY:
|
||||||
|
raise HTTPException(status_code=500, detail="Aliyun credentials not configured")
|
||||||
|
|
||||||
deepgram = DeepgramClient(DEEPGRAM_API_KEY)
|
client = create_tingwu_client()
|
||||||
|
|
||||||
payload: FileSource = {
|
# 1. 创建任务
|
||||||
"buffer": audio_data,
|
task_req = tingwu_models.CreateTaskRequest(
|
||||||
"mimetype": "audio/wav" if filename.endswith(".wav") else "audio/mp3"
|
type="offline",
|
||||||
}
|
input=tingwu_models.Input(
|
||||||
|
source="oss", # 先上传到 OSS 或使用 URL
|
||||||
options = PrerecordedOptions(
|
file_url="", # TODO: 需要 OSS 上传
|
||||||
model="nova-3",
|
),
|
||||||
language="zh",
|
parameters=tingwu_models.Parameters(
|
||||||
smart_format=True,
|
transcription=tingwu_models.Transcription(
|
||||||
diarize=True, # 说话人分离
|
diarization_enabled=True,
|
||||||
utterances=True,
|
sentence_max_length=20
|
||||||
punctuate=True,
|
),
|
||||||
paragraphs=True
|
summarization=tingwu_models.Summarization(enabled=False)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
|
# TODO: submit task_req to Tingwu (via the SDK client or the HTTP API) — it is built above but never sent
|
||||||
|
# A real run needs the audio uploaded to OSS, or an accessible URL, to fill in file_url
|
||||||
# 解析结果
|
|
||||||
result = response.results
|
|
||||||
full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
|
|
||||||
|
|
||||||
# 提取带说话人的片段
|
|
||||||
segments = []
|
|
||||||
if result.utterances:
|
|
||||||
for u in result.utterances:
|
|
||||||
segments.append({
|
|
||||||
"start": u.start,
|
|
||||||
"end": u.end,
|
|
||||||
"text": u.transcript,
|
|
||||||
"speaker": f"Speaker {u.speaker}"
|
|
||||||
})
|
|
||||||
|
|
||||||
|
# Mock result for testing — returned unconditionally until the real Tingwu call is implemented
|
||||||
return {
|
return {
|
||||||
"full_text": full_text,
|
"full_text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语。",
|
||||||
"segments": segments
|
"segments": [
|
||||||
|
{"start": 0.0, "end": 5.0, "text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语。", "speaker": "Speaker A"}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
def extract_entities_with_llm(text: str) -> List[Entity]:
|
def extract_entities_with_llm(text: str) -> List[Entity]:
|
||||||
@@ -107,7 +110,7 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
|
|||||||
|
|
||||||
prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
|
prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
|
||||||
|
|
||||||
文本:{text[:3000]} # 限制长度避免超限
|
文本:{text[:3000]}
|
||||||
|
|
||||||
要求:
|
要求:
|
||||||
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
|
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
|
||||||
@@ -136,7 +139,6 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
|
|||||||
result = response.json()
|
result = response.json()
|
||||||
content = result["choices"][0]["message"]["content"]
|
content = result["choices"][0]["message"]["content"]
|
||||||
|
|
||||||
# 解析 JSON
|
|
||||||
import re
|
import re
|
||||||
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
|
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
|
||||||
if json_match:
|
if json_match:
|
||||||
@@ -162,24 +164,24 @@ async def upload_audio(file: UploadFile = File(...)):
|
|||||||
"""上传音频并分析"""
|
"""上传音频并分析"""
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
|
|
||||||
# Deepgram 转录
|
# 听悟转录
|
||||||
print(f"Transcribing with Deepgram: {file.filename}")
|
print(f"Transcribing with Tingwu: {file.filename}")
|
||||||
dg_result = transcribe_with_deepgram(content, file.filename)
|
tw_result = transcribe_with_tingwu(content, file.filename)
|
||||||
|
|
||||||
# 构建片段
|
# 构建片段
|
||||||
segments = [
|
segments = [
|
||||||
TranscriptSegment(**seg) for seg in dg_result["segments"]
|
TranscriptSegment(**seg) for seg in tw_result["segments"]
|
||||||
] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]
|
] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
|
||||||
|
|
||||||
# LLM 实体提取
|
# LLM 实体提取
|
||||||
print("Extracting entities with LLM...")
|
print("Extracting entities with LLM...")
|
||||||
entities = extract_entities_with_llm(dg_result["full_text"])
|
entities = extract_entities_with_llm(tw_result["full_text"])
|
||||||
|
|
||||||
analysis = AnalysisResult(
|
analysis = AnalysisResult(
|
||||||
transcript_id=os.urandom(8).hex(),
|
transcript_id=os.urandom(8).hex(),
|
||||||
segments=segments,
|
segments=segments,
|
||||||
entities=entities,
|
entities=entities,
|
||||||
full_text=dg_result["full_text"],
|
full_text=tw_result["full_text"],
|
||||||
created_at=datetime.now().isoformat()
|
created_at=datetime.now().isoformat()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.32.0
|
uvicorn[standard]==0.32.0
|
||||||
python-multipart==0.0.17
|
python-multipart==0.0.17
|
||||||
deepgram-sdk==3.7.0
|
alibabacloud_tingwu20230930==2.0.2
|
||||||
httpx==0.27.2
|
httpx==0.27.2
|
||||||
pydantic==2.9.2
|
pydantic==2.9.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
|||||||
Reference in New Issue
Block a user