MinerU 的 API 调用

import requests
import time
import os

class PDFProcessor:
    """Upload a local PDF and have it parsed by the MinerU extraction API.

    The file is first pushed to tmpfiles.org so that it is reachable by URL,
    then a MinerU extraction task is created for that URL, polled until it
    finishes, and the resulting zip archive is saved to the current working
    directory.

    Every failure is reported with ``print`` and signalled by returning
    ``None``. Error handling catches ``OSError`` for I/O problems:
    ``requests.RequestException`` derives from ``IOError``/``OSError``, so
    that single class covers both local file errors and network errors.
    """

    UPLOAD_URL = 'https://tmpfiles.org/api/v1/upload'
    TASK_URL = 'https://mineru.net/api/v4/extract/task'
    # Stop polling after this many consecutive failed status requests.
    MAX_POLL_ERRORS = 3

    def __init__(self, token, *, timeout=60, poll_interval=5, max_wait=3600):
        """Store the credentials and the network tuning knobs.

        Args:
            token: MinerU API bearer token.
            timeout: Per-request timeout in seconds passed to ``requests``.
            poll_interval: Seconds to sleep before each status request.
            max_wait: Upper bound in seconds for the whole polling phase;
                ``None`` waits indefinitely.
        """
        self.token = token
        self.headers = {
            'Authorization': f'Bearer {token}'
        }
        self.timeout = timeout
        self.poll_interval = poll_interval
        self.max_wait = max_wait

    def upload_file(self, file_path):
        """Upload a local file to tmpfiles.org.

        Args:
            file_path: Path of the file to upload.

        Returns:
            The direct-download URL of the uploaded file, or ``None`` when
            the file cannot be read, the request fails, or the response does
            not have the expected shape.
        """
        print(f"📤 正在上传文件: {os.path.basename(file_path)}")

        try:
            with open(file_path, 'rb') as f:
                response = requests.post(
                    self.UPLOAD_URL,
                    files={'file': f},
                    timeout=self.timeout,
                )
            if response.status_code != 200:
                print(f"❌ 上传失败: HTTP {response.status_code}")
                return None
            url = response.json()['data']['url']
            # The returned URL is a landing page; inserting "dl/" after the
            # host turns it into the direct-download link.
            direct_url = url.replace('tmpfiles.org/', 'tmpfiles.org/dl/', 1)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # OSError: unreadable file or network failure.
            # The others: response body that is not the expected JSON.
            print(f"❌ 上传失败: {e}")
            return None

        print(f"✅ 上传成功: {direct_url}")
        return direct_url

    def process_pdf(self, file_path):
        """Run the full pipeline for one local PDF.

        Uploads the file, creates a MinerU extraction task, waits for it to
        finish and downloads the result archive.

        Args:
            file_path: Path of the PDF to process.

        Returns:
            The final status payload (a dict) when the task reaches the
            ``done`` state, otherwise ``None``.
        """
        # 1. Upload the file so it is reachable by URL.
        pdf_url = self.upload_file(file_path)
        if not pdf_url:
            return None

        # 2. Create the MinerU task.
        print("📄 创建解析任务...")
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.token}'
        }
        data = {
            'url': pdf_url,
            'is_ocr': True,
            'enable_formula': True,
            'enable_table': True,
            'language': 'auto'  # let the service detect the document language
        }

        try:
            response = requests.post(
                self.TASK_URL, headers=headers, json=data,
                timeout=self.timeout,
            )
            result = response.json()
        except (OSError, ValueError) as e:
            print(f"❌ 创建任务失败: {e}")
            return None

        if not isinstance(result, dict) or result.get('code') != 0:
            msg = result.get('msg') if isinstance(result, dict) else result
            print(f"❌ 创建任务失败: {msg}")
            return None

        task_info = result.get('data')
        task_id = task_info.get('task_id') if isinstance(task_info, dict) else None
        if not task_id:
            print(f"❌ 创建任务失败: {result.get('msg')}")
            return None
        print(f"✅ 任务ID: {task_id}")

        # 3. Wait for the task to finish.
        print("⏳ 等待处理...")
        return self._wait_for_task(task_id, headers)

    def _wait_for_task(self, task_id, headers):
        """Poll the task status until it is done, failed, or out of time.

        Args:
            task_id: Identifier returned when the task was created.
            headers: HTTP headers (including authorization) for each request.

        Returns:
            The status payload once the state is ``done`` (after downloading
            the archive), or ``None`` on failure, repeated request errors, or
            when ``max_wait`` elapses.
        """
        status_url = f'{self.TASK_URL}/{task_id}'
        deadline = None
        if self.max_wait is not None:
            # Monotonic clock: immune to wall-clock adjustments.
            deadline = time.monotonic() + self.max_wait
        failures = 0

        while deadline is None or time.monotonic() < deadline:
            time.sleep(self.poll_interval)
            try:
                status_response = requests.get(
                    status_url, headers=headers, timeout=self.timeout,
                )
                status_data = status_response.json()
                task = status_data['data']
                state = task['state']
            except (OSError, ValueError, KeyError, TypeError) as e:
                # Tolerate a few transient errors instead of abandoning a
                # task that may still be running on the server.
                failures += 1
                print(f"⚠️ 查询任务状态失败 ({failures}/{self.MAX_POLL_ERRORS}): {e}")
                if failures >= self.MAX_POLL_ERRORS:
                    return None
                continue
            failures = 0

            if state == 'done':
                zip_url = task.get('full_zip_url')
                print("✅ 处理完成!")
                print(f"📦 下载地址: {zip_url}")

                if zip_url:
                    self.download_result(zip_url, task_id)
                return status_data

            if state == 'failed':
                print(f"❌ 处理失败: {task.get('err_msg')}")
                return None

            if state == 'running':
                progress = task.get('extract_progress') or {}
                extracted = progress.get('extracted_pages', 0)
                total = progress.get('total_pages', 0)
                print(f"⏳ 正在处理... {extracted}/{total} 页")
            else:
                print(f"📊 状态: {state}")

        print(f"❌ 等待超时: 任务 {task_id} 在 {self.max_wait} 秒内未完成")
        return None

    def download_result(self, zip_url, task_id):
        """Download the result archive into the current directory.

        The data is streamed to a ``.part`` file and renamed only after the
        transfer succeeds, so a failed download never leaves a truncated
        archive under the final name.

        Args:
            zip_url: URL of the result zip archive.
            task_id: Task identifier, used to build the output file name.

        Returns:
            The path of the saved archive, or ``None`` when the download
            fails.
        """
        save_path = f"mineru_result_{task_id}.zip"
        part_path = f"{save_path}.part"

        try:
            with requests.get(zip_url, stream=True,
                              timeout=self.timeout) as response:
                # Without this an HTTP error page would be saved as the zip.
                response.raise_for_status()
                with open(part_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(part_path, save_path)
        except OSError as e:
            print(f"❌ 下载失败: {e}")
            try:
                os.remove(part_path)
            except OSError:
                # Best-effort cleanup; the partial file may not exist.
                pass
            return None

        print(f"✅ 结果已保存到: {save_path}")
        return save_path

# The API token is read from the MINERU_TOKEN environment variable so that
# no credential is stored in the source file.
TOKEN = os.environ.get("MINERU_TOKEN", "")

# Default input document; a path given on the command line takes precedence.
local_pdf = "/home/wang/1_time/00bjfei/宁德时代宁德时代新能源科技股份有限公司2024年半年度报告174页(1).pdf"


def main():
    """Process one PDF and return a process exit status.

    The PDF path is taken from the first command-line argument when one is
    given, otherwise from ``local_pdf``.

    Returns:
        0 when the document was processed, 1 when the token is missing, the
        file does not exist, or processing failed.
    """
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else local_pdf

    if not TOKEN:
        print("❌ 未设置环境变量 MINERU_TOKEN")
        return 1

    # isfile rather than exists: a directory cannot be uploaded.
    if not os.path.isfile(pdf_path):
        print(f"❌ 文件不存在: {pdf_path}")
        return 1

    processor = PDFProcessor(TOKEN)
    return 0 if processor.process_pdf(pdf_path) is not None else 1


# Run only when executed as a script, so importing the module has no
# network or file-system side effects.
if __name__ == "__main__":
    raise SystemExit(main())

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注