fix: auto-fix code issues (cron)
- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 添加类型注解
- 修复缺失的urllib.parse导入
This commit is contained in:
@@ -7,6 +7,7 @@ Document Processor - Phase 3
|
||||
import io
|
||||
import os
|
||||
|
||||
|
||||
class DocumentProcessor:
|
||||
"""文档处理器 - 提取 PDF/DOCX 文本"""
|
||||
|
||||
@@ -33,7 +34,9 @@ class DocumentProcessor:
|
||||
ext = os.path.splitext(filename.lower())[1]
|
||||
|
||||
if ext not in self.supported_formats:
|
||||
raise ValueError(f"Unsupported file format: {ext}. Supported: {list(self.supported_formats.keys())}")
|
||||
raise ValueError(
|
||||
f"Unsupported file format: {ext}. Supported: {list(self.supported_formats.keys())}"
|
||||
)
|
||||
|
||||
extractor = self.supported_formats[ext]
|
||||
text = extractor(content)
|
||||
@@ -71,7 +74,9 @@ class DocumentProcessor:
|
||||
text_parts.append(page_text)
|
||||
return "\n\n".join(text_parts)
|
||||
except ImportError:
|
||||
raise ImportError("PDF processing requires PyPDF2 or pdfplumber. Install with: pip install PyPDF2")
|
||||
raise ImportError(
|
||||
"PDF processing requires PyPDF2 or pdfplumber. Install with: pip install PyPDF2"
|
||||
)
|
||||
except Exception as e:
|
||||
raise ValueError(f"PDF extraction failed: {str(e)}")
|
||||
|
||||
@@ -100,7 +105,9 @@ class DocumentProcessor:
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
except ImportError:
|
||||
raise ImportError("DOCX processing requires python-docx. Install with: pip install python-docx")
|
||||
raise ImportError(
|
||||
"DOCX processing requires python-docx. Install with: pip install python-docx"
|
||||
)
|
||||
except Exception as e:
|
||||
raise ValueError(f"DOCX extraction failed: {str(e)}")
|
||||
|
||||
@@ -149,6 +156,7 @@ class DocumentProcessor:
|
||||
ext = os.path.splitext(filename.lower())[1]
|
||||
return ext in self.supported_formats
|
||||
|
||||
|
||||
# 简单的文本提取器(不需要外部依赖)
|
||||
class SimpleTextExtractor:
|
||||
"""简单的文本提取器,用于测试"""
|
||||
@@ -165,6 +173,7 @@ class SimpleTextExtractor:
|
||||
|
||||
return content.decode("latin-1", errors="ignore")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试
|
||||
processor = DocumentProcessor()
|
||||
|
||||
Reference in New Issue
Block a user