import requests
import time
import os
class PDFProcessor:
    """Run a local PDF through the MinerU extraction API.

    The workflow is: upload the file to tmpfiles.org to obtain a public URL,
    create a MinerU extraction task for that URL, poll the task until it
    finishes, then download the resulting zip archive into the current
    working directory.

    All methods report progress and failures with ``print`` and signal
    failure by returning ``None`` rather than raising.
    """

    UPLOAD_URL = 'https://tmpfiles.org/api/v1/upload'
    TASK_URL = 'https://mineru.net/api/v4/extract/task'
    # Seconds to wait on a single HTTP exchange; without a timeout
    # ``requests`` blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Uploads get a longer allowance than the small JSON API calls.
    UPLOAD_TIMEOUT = 300

    def __init__(self, token):
        """Store the API token and the matching bearer-auth header.

        Args:
            token: MinerU API token sent as ``Authorization: Bearer <token>``.
        """
        self.token = token
        self.headers = {
            'Authorization': f'Bearer {token}'
        }

    def upload_file(self, file_path):
        """Upload a local file to tmpfiles.org temporary storage.

        Args:
            file_path: Path of the file to upload.

        Returns:
            The direct-download URL of the uploaded file, or ``None`` if the
            file could not be read, the request failed, the server did not
            answer with HTTP 200, or the response had an unexpected shape.
        """
        print(f"📤 正在上传文件: {os.path.basename(file_path)}")
        try:
            with open(file_path, 'rb') as f:
                response = requests.post(
                    self.UPLOAD_URL,
                    files={'file': f},
                    timeout=self.UPLOAD_TIMEOUT,
                )
            if response.status_code != 200:
                print(f"❌ 上传失败: HTTP {response.status_code}")
                return None
            url = response.json()['data']['url']
            # Turn the landing-page URL into a direct-download link; only the
            # first occurrence (the host part) must be rewritten.
            direct_url = url.replace('tmpfiles.org/', 'tmpfiles.org/dl/', 1)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # OSError covers local file errors as well as the exceptions
            # raised by ``requests`` (RequestException derives from IOError);
            # the remaining types cover a malformed JSON payload.
            print(f"❌ 上传失败: {e}")
            return None
        print(f"✅ 上传成功: {direct_url}")
        return direct_url

    def process_pdf(self, file_path, poll_interval=5, max_wait=3600):
        """Upload a local PDF, extract it with MinerU and download the result.

        Args:
            file_path: Path of the local PDF file.
            poll_interval: Seconds to sleep before each status query.
            max_wait: Maximum number of seconds to wait for the task to
                finish; ``None`` waits without limit.

        Returns:
            The final status payload (parsed JSON) when the task reaches the
            ``done`` state, otherwise ``None``.
        """
        # 1. Upload the file so the API can fetch it by URL.
        pdf_url = self.upload_file(file_path)
        if not pdf_url:
            return None

        # 2. Create the MinerU extraction task.
        print("📄 创建解析任务...")
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.token}'
        }
        payload = {
            'url': pdf_url,
            'is_ocr': True,
            'enable_formula': True,
            'enable_table': True,
            # NOTE(review): 'auto' presumably lets the service detect the
            # document language - confirm against the MinerU API docs.
            'language': 'auto',
        }
        try:
            response = requests.post(
                self.TASK_URL,
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
            result = response.json()
            if result['code'] != 0:
                print(f"❌ 创建任务失败: {result.get('msg')}")
                return None
            task_id = result['data']['task_id']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print(f"❌ 创建任务失败: {e}")
            return None
        print(f"✅ 任务ID: {task_id}")

        # 3. Wait for the task to finish.
        print("⏳ 等待处理...")
        return self._wait_for_result(task_id, headers, poll_interval, max_wait)

    def _wait_for_result(self, task_id, headers, poll_interval, max_wait):
        """Poll the task until it is done, has failed, or ``max_wait`` elapses."""
        status_url = f'{self.TASK_URL}/{task_id}'
        deadline = None if max_wait is None else time.monotonic() + max_wait

        while deadline is None or time.monotonic() < deadline:
            time.sleep(poll_interval)
            try:
                status_response = requests.get(
                    status_url,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                status_data = status_response.json()
                task = status_data['data']
                state = task['state']
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # A single failed query should not abort a long-running job;
                # keep polling until the deadline.
                print(f"⚠️ 查询状态失败, 稍后重试: {e}")
                continue

            if state == 'done':
                zip_url = task.get('full_zip_url')
                print("✅ 处理完成!")
                print(f"📦 下载地址: {zip_url}")
                if zip_url:
                    self.download_result(zip_url, task_id)
                return status_data
            if state == 'failed':
                print(f"❌ 处理失败: {task.get('err_msg')}")
                return None
            if state == 'running':
                progress = task.get('extract_progress') or {}
                extracted = progress.get('extracted_pages', 0)
                total = progress.get('total_pages', 0)
                print(f"⏳ 正在处理... {extracted}/{total} 页")
            else:
                print(f"📊 状态: {state}")

        print(f"❌ 等待超时: 任务 {task_id} 在 {max_wait} 秒内未完成")
        return None

    def download_result(self, zip_url, task_id):
        """Download the result archive to ``mineru_result_<task_id>.zip``.

        Args:
            zip_url: URL of the zip archive produced by the task.
            task_id: Task identifier, used to build the local file name.

        Returns:
            The path of the saved file, or ``None`` if the download failed.
        """
        save_path = f"mineru_result_{task_id}.zip"
        try:
            with requests.get(
                zip_url, stream=True, timeout=self.REQUEST_TIMEOUT
            ) as response:
                # Fail on 4xx/5xx instead of saving an error page as a zip.
                response.raise_for_status()
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except OSError as e:
            # Covers file-system errors and ``requests`` exceptions alike
            # (RequestException derives from IOError).
            print(f"❌ 下载失败: {e}")
            return None
        print(f"✅ 结果已保存到: {save_path}")
        return save_path
# PDF processed when no path is given on the command line.
DEFAULT_PDF = "/home/wang/1_time/00bjfei/宁德时代宁德时代新能源科技股份有限公司2024年半年度报告174页(1).pdf"


def main(argv=None):
    """Process one local PDF using the token from the environment.

    The API token is read from the ``MINERU_TOKEN`` environment variable so
    that no credential is stored in the source file.
    NOTE: a token that was ever committed to source control should be
    treated as compromised and rotated.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the PDF
            path; otherwise ``DEFAULT_PDF`` is used.

    Returns:
        ``0`` when processing succeeded, ``1`` otherwise.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)

    token = os.environ.get("MINERU_TOKEN")
    if not token:
        print("❌ 未设置环境变量 MINERU_TOKEN")
        return 1

    local_pdf = args[0] if args else DEFAULT_PDF
    # Check the file exists before contacting any remote service.
    if not os.path.exists(local_pdf):
        print(f"❌ 文件不存在: {local_pdf}")
        return 1

    processor = PDFProcessor(token)
    result = processor.process_pdf(local_pdf)
    return 0 if result is not None else 1


if __name__ == "__main__":
    raise SystemExit(main())