crawl4ai负责爬取,openai库调用DeepSeek进行分析,LaTeX负责生成报告

整体流程:

创建环境: conda create -n crawl_env python=3.11 -y
激活环境: conda activate crawl_env
初始化Conda:conda init bash
从网页抓取核心内容并转换为Markdown:pip install "crawl4ai>=0.6.0"
setup 用于安装浏览器依赖,doctor 用于诊断环境是否配置正确:crawl4ai-setup 和 crawl4ai-doctor
安装OpenAI官方的Python库用它来调用DeepSeek等兼容OpenAI API规范的模型:pip install openai
切换到root用户: sudo -i
安装思源系列中文字体:sudo apt-get install fonts-noto-cjk

目录结构:

源码:

# ai_analyzer.py (修正版,增强章节定位能力)
# 负责与DeepSeek API交互,进行多任务分析

import asyncio
import re
import json
import os
from openai import AsyncOpenAI
from config import DEEPSEEK_API_KEY, DEEPSEEK_BASE_URL, DEFAULT_MODEL, CACHE_FILE

client = AsyncOpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)

def _load_cache() -> dict:
    """Load the AI-analysis cache from CACHE_FILE.

    Returns an empty dict when the cache file does not exist or
    contains invalid JSON.
    """
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError:
        return {}

def _save_cache(cache_data: dict):
    """Persist the cache dict to CACHE_FILE as pretty-printed UTF-8 JSON."""
    serialized = json.dumps(cache_data, ensure_ascii=False, indent=4)
    with open(CACHE_FILE, 'w', encoding='utf-8') as f:
        f.write(serialized)

async def _call_ai(system_prompt: str, user_content: str, model: str) -> str | None:
    """Send one system+user exchange to the chat-completions API.

    Returns the assistant's reply text, or None when the request fails
    for any reason (the error is printed, not raised).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    try:
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"❌ 调用AI时发生错误: {e}")
        return None

async def analyze_content(full_markdown: str, url: str, model: str = DEFAULT_MODEL) -> dict | None:
    """Run the three analysis tasks (abstract translation, main-body summary,
    conclusion summary) over an article's Markdown, with a per-URL cache.

    Returns a dict with keys 'abstract_translation', 'main_body_summary'
    and 'conclusion_summary', or None when any of the AI calls fails.
    """
    required_keys = ("abstract_translation", "main_body_summary", "conclusion_summary")
    cache = _load_cache()
    if url in cache and all(k in cache[url] for k in required_keys):
        print("✅ 从本地缓存中加载AI分析结果!")
        return cache[url]

    print(f"➡️ [步骤 2/4] 本地无缓存,正在连接AI进行分析 (模型: {model})...")

    prompts = {
        "abstract": "You are a professional academic translator. Your task is to accurately translate the following research paper abstract into simplified Chinese.",
        "main_body": "You are an expert academic analyst. Summarize the core contributions, methods, and key findings from the main body of the following article in about 300-500 words. Present your summary in a structured, easy-to-read format in simplified Chinese.",
        "conclusion": "You are an expert academic analyst. Your task is to summarize the conclusion section of the following article, highlighting the main takeaways and future work mentioned. Provide the summary in simplified Chinese."
    }
    # Locate the Abstract / Conclusion sections, tolerating Markdown
    # headings with optional numbering in front of the section title.
    abstract_regex = r'(?i)(?:#+\s*|\n)\s*(?:\d*\.?\s*)?Abstract\n(.*?)(?=\n#+\s|\n\d*\.?\s*Introduction)'
    conclusion_regex = r'(?i)(?:#+\s*|\n)\s*(?:\d*\.?\s*)?Conclusion(?:s)?\n(.*?)(?=\n#+\s|\n\d*\.?\s*(?:References|Acknowledgements|Appendix))'

    abstract_match = re.search(abstract_regex, full_markdown, re.DOTALL)
    conclusion_match = re.search(conclusion_regex, full_markdown, re.DOTALL)

    # Fall back to explicit placeholder text when a section is missing so
    # the model is told clearly that nothing was found.
    abstract_text = abstract_match.group(1).strip() if abstract_match else "Abstract not found in the document."
    conclusion_text = conclusion_match.group(1).strip() if conclusion_match else "Conclusion not found in the document."

    # The main body is everything between the two sections when both were
    # located; otherwise the whole document is summarized.
    if abstract_match and conclusion_match:
        main_body_content = full_markdown[abstract_match.end():conclusion_match.start()]
    else:
        main_body_content = full_markdown

    coroutines = [
        _call_ai(prompts["abstract"], abstract_text, model),
        _call_ai(prompts["main_body"], main_body_content, model),
        _call_ai(prompts["conclusion"], conclusion_text, model),
    ]
    # Run the three API calls concurrently.
    results = await asyncio.gather(*coroutines)
    summaries = dict(zip(required_keys, results))

    if not all(summaries.values()):
        print("❌ AI总结失败,部分内容未能生成。")
        return None

    print("✅ AI分析完成!正在将结果存入本地缓存...")

    cache[url] = summaries
    _save_cache(cache)

    return summaries

修改你需要的模型:

# config.py
# Central configuration for the report generator.

import os

# ==============================================================================
#  API and model configuration
# ==============================================================================

# API key. Read from the DEEPSEEK_API_KEY environment variable so the key
# never has to live in the source tree. Whitespace is stripped so that a
# blank/placeholder value fails the startup check in run.py instead of
# silently passing the truthiness test.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "").strip()

# DeepSeek API endpoint (OpenAI-compatible).
DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"

# Default model name.
DEFAULT_MODEL = "deepseek-chat"

# ==============================================================================
#  Output configuration
# ==============================================================================

# Folder where generated reports are written.
OUTPUT_DIR = "latex_reports"

# ==============================================================================
#  Cache configuration
# ==============================================================================
# File holding cached AI analysis results.
CACHE_FILE = "ai_cache.json"
# crawler.py
# 负责爬取网页内容

import re
from crawl4ai import AsyncWebCrawler

async def fetch_article_data(url: str) -> tuple[str | None, str | None]:
    """
    Crawl the given URL and extract its Markdown content and title.

    Returns: (markdown_content, title) on success, (None, None) on failure.
    """
    print(f"➡️ [步骤 1/4] 正在爬取文献内容: {url}")
    try:
        # Use the crawler as an async context manager so the underlying
        # browser session is started and shut down cleanly — the original
        # never closed the crawler, leaking the browser process.
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)
        if not result or not result.markdown:
            print(f"❌ 爬取失败: 未能从 {url} 提取到有效内容。")
            return None, None

        # Use the first Markdown H1 as the document title, if present.
        title_match = re.search(r"^#\s+(.*)", result.markdown, re.MULTILINE)
        title = title_match.group(1).strip() if title_match else "Untitled Document"

        print("✅ 内容爬取成功!")
        return result.markdown, title
    except Exception as e:
        print(f"❌ 爬取时发生错误: {e}")
        return None, None
# report_generator.py
# 负责生成LaTeX源码并编译成PDF

import os
import re
import subprocess
from datetime import datetime
from config import OUTPUT_DIR

def _latex_escape(text: str) -> str:
    """对文本进行转义以安全插入LaTeX。"""
    replacements = {
        '&': r'\&', '%': r'\%', '$': r'\$', '#': r'\#', '_': r'\_',
        '{': r'\{', '}': r'\}', '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}', '\\': r'\textbackslash{}',
    }
    return re.sub(r'[&%$#_{}\\~^]', lambda m: replacements[m.group(0)], text)

def _create_latex_source(data: dict) -> str:
    """Render the report's LaTeX source from the collected report data.

    *data* must provide 'title', 'url' and 'date'; the three summary keys
    default to empty strings when absent. All user text is LaTeX-escaped.
    """
    esc = _latex_escape
    title_escaped = esc(data['title'])
    url_escaped = esc(data['url'])
    summaries = {
        key: esc(data.get(key, ''))
        for key in ('abstract_translation', 'main_body_summary', 'conclusion_summary')
    }
    abstract_escaped = summaries['abstract_translation']
    main_body_escaped = summaries['main_body_summary']
    conclusion_escaped = summaries['conclusion_summary']

    return rf"""
\documentclass[12pt, a4paper]{{article}}
\usepackage{{ctex}}
\usepackage[top=2.5cm, bottom=2.5cm, left=2.5cm, right=2.5cm]{{geometry}}
\usepackage{{fancyhdr}}
\usepackage{{hyperref}}
\usepackage{{titling}}
\setmainfont{{Times New Roman}}

\pagestyle{{fancy}}
\fancyhf{{}}
\fancyhead[C]{{{title_escaped}}}
\fancyfoot[C]{{\thepage}}
\renewcommand{{\headrulewidth}}{{0.4pt}}
\renewcommand{{\footrulewidth}}{{0.4pt}}

\pretitle{{\begin{{center}}\LARGE\bfseries}}\posttitle{{\end{{center}}}}
\preauthor{{\begin{{center}}\large}}\postauthor{{\end{{center}}}}
\predate{{\begin{{center}}\large}}\postdate{{\end{{center}}}}

\title{{{title_escaped}}}
\author{{文献来源: \href{{{url_escaped}}}{{{url_escaped}}}}}
\date{{AI总结报告生成于: {data['date']}}}

\begin{{document}}
\maketitle
\thispagestyle{{fancy}}
\section*{{摘要翻译}}
{abstract_escaped}
\section*{{核心内容总结}}
{main_body_escaped}
\section*{{结论总结}}
{conclusion_escaped}
\end{{document}}
"""


def generate_pdf_report(report_data: dict):
    """Write the LaTeX source for *report_data* and compile it to a PDF.

    Runs xelatex twice (headers and cross-references need a second pass),
    prints the tail of the log on failure, and removes the intermediate
    build files on success.
    """
    print("➡️ [步骤 3/4] 正在生成LaTeX报告源文件...")
    tex_source = _create_latex_source(report_data)
    print("✅ LaTeX源文件生成完毕!")

    print("➡️ [步骤 4/4] 正在编译PDF报告...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Build a filesystem-safe base name from the title.
    title = report_data.get('title', 'report')
    filename_base = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")[:50]
    tex_filepath = os.path.join(OUTPUT_DIR, f"{filename_base}.tex")

    with open(tex_filepath, 'w', encoding='utf-8') as f:
        f.write(tex_source)

    command = ['xelatex', '-interaction=nonstopmode', f'-output-directory={OUTPUT_DIR}', tex_filepath]

    for i in range(2):
        print(f"   ... LaTeX编译中 (第 {i+1}/2 轮)")
        # errors='replace' guards against non-UTF-8 bytes in xelatex output,
        # which would otherwise raise UnicodeDecodeError and abort the run.
        result = subprocess.run(command, capture_output=True, text=True,
                                encoding='utf-8', errors='replace')
        if result.returncode != 0:
            log_path = os.path.join(OUTPUT_DIR, f'{filename_base}.log')
            print(f"❌ PDF编译失败!请查看日志: {log_path}")
            print("-" * 20 + " LaTeX 错误日志 " + "-" * 20)
            if os.path.exists(log_path):
                # LaTeX logs are not guaranteed to be valid UTF-8 either.
                with open(log_path, 'r', encoding='utf-8', errors='replace') as log_file:
                    print("".join(log_file.readlines()[-30:]))
            print("-" * 55)
            return

    # Remove intermediate build artifacts, keeping only the PDF.
    for ext in ['.aux', '.log', '.out', '.tex']:
        try:
            os.remove(os.path.join(OUTPUT_DIR, f"{filename_base}{ext}"))
        except OSError:
            pass

    pdf_filepath = os.path.join(OUTPUT_DIR, f"{filename_base}.pdf")
    print(f"🎉 报告生成成功!文件已保存至: {os.path.abspath(pdf_filepath)}")

# run.py (带缓存功能)
# 主程序入口,负责调度所有模块

import asyncio
import sys
import argparse
import subprocess
import os
import json
from datetime import datetime

import config
from crawler import fetch_article_data
from ai_analyzer import analyze_content
from report_generator import generate_pdf_report

def check_dependencies():
    """Check required external dependencies (API key and a LaTeX toolchain).

    Exits the process with status 1 when either is missing.
    """
    # .strip() catches the common mistake of leaving a whitespace-only
    # placeholder key in config.py, which would otherwise pass the
    # truthiness test and fail much later inside the API call.
    if not config.DEEPSEEK_API_KEY.strip():
        print("❌ 错误: API密钥未在 config.py 中配置!")
        sys.exit(1)

    try:
        subprocess.run(['xelatex', '-version'], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("❌ 错误: 系统中未找到 'xelatex' 命令。请先安装LaTeX发行版。")
        sys.exit(1)

async def main():
    """Top-level workflow: parse args, crawl, analyze, and build the report."""
    parser = argparse.ArgumentParser(description="学术文献AI总结报告生成器 V2.1 (带缓存)")
    parser.add_argument('url', help="要处理的学术文献URL。")
    parser.add_argument('--model', default=config.DEFAULT_MODEL, help=f"使用的DeepSeek模型 (默认: {config.DEFAULT_MODEL})。")
    parser.add_argument('--force-reanalyze', action='store_true', help="强制重新进行AI分析,忽略此URL的现有缓存。")
    args = parser.parse_args()

    # A forced refresh is implemented by dropping this URL's cache entry
    # before the analysis step, so analyze_content sees a cache miss.
    if args.force_reanalyze and os.path.exists(config.CACHE_FILE):
        print("🌀 用户选择强制重新分析,将更新此URL的缓存。")
        try:
            with open(config.CACHE_FILE, 'r', encoding='utf-8') as f:
                cached = json.load(f)
            if args.url in cached:
                del cached[args.url]
                with open(config.CACHE_FILE, 'w', encoding='utf-8') as f:
                    json.dump(cached, f, ensure_ascii=False, indent=4)
                print(f"   已从缓存中移除URL: {args.url}")
        except (json.JSONDecodeError, FileNotFoundError):
            pass  # A broken cache file is deliberately ignored here.

    # Step 1: crawl the article.
    markdown, title = await fetch_article_data(args.url)
    if not markdown:
        return

    # Step 2: run the AI analysis (may be served from cache).
    summaries = await analyze_content(markdown, args.url, args.model)
    if not summaries:
        return

    # Step 3: merge everything and render the PDF report.
    generate_pdf_report({
        "title": title,
        "url": args.url,
        "date": datetime.now().strftime('%Y年%m月%d日'),
        **summaries,
    })

if __name__ == "__main__":
    # Entry point: verify external dependencies first, then run the async
    # pipeline (crawl -> analyze -> report) to completion.
    print("--- 启动报告生成器 (带缓存功能) ---")
    check_dependencies()
    asyncio.run(main())
    print("--- 报告生成器运行完毕 ---")

爬取生成PDF:在latex_reports文件夹下

AutoGen框架深度解析:我如何用Python构建一个“AI足球分析师”团队?

*注意修改配置栏*

import asyncio
import json
import os
from datetime import datetime
from tavily import TavilyClient
from autogen_agentchat.agents import AssistantAgent
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.messages import TextMessage

# ================== Configuration ==================
# Fill in your own keys and endpoint before running. (The blog paste had
# typographic "smart quotes" here, which are a Python syntax error; they
# are restored to plain ASCII quotes.)
API_KEY = "sk-your API_KEY"
BASE_URL = "模型对应的URL"
MODEL = "模型名称"
TAVILY_API_KEY = "tvly-your TAVILY_API_KEY"

# Initialize the model client and the Tavily search client.
model_client = OpenAIChatCompletionClient(
    model=MODEL,
    base_url=BASE_URL,
    api_key=API_KEY,
    model_info={"vision": False, "function_calling": True, "json_output": True, "family": "gpt-4"},
    temperature=0.3,
)
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

# ================== 定义专家Agent团队 (全自动版) ==================

# Agent 1: 足球数据研究员 (负责规划与搜索)
# Agent 1: Football data researcher (plans the web searches).
research_planner = AssistantAgent(
    name="Research_Planner",
    model_client=model_client,
    system_message="""你是顶级的足球数据研究员。你的任务是接收用户关于一场比赛的预测问题,
然后制定一个全面的数据搜集计划,用于Tavily搜索引擎。

你的计划必须涵盖以下几个方面,以确保分析的全面性:
1. 两队之间的**历史交锋记录** (Head-to-Head)。
2. 每支球队**各自最近的比赛战绩**和状态 (Recent Form)。
3. 任何关于球队的**最新新闻**,如关键球员伤病、教练变动等 (Latest News)。

你的输出必须是标准的JSON格式,只包含一个'search_queries'键,其值为一个关键词列表。

示例输入: "预测 皇家马德里 vs 巴塞罗那 的比分"
你的输出:
{
"search_queries": [
"Real Madrid vs Barcelona head to head results",
"Real Madrid recent match results La Liga",
"Barcelona recent match results La Liga",
"Real Madrid team news injuries",
"Barcelona team news formation"
]
}
""",
)

# Agent 2: 首席战术分析与评论员 (负责分析与报告)
# Agent 2: Chief tactical analyst and columnist (analysis + final write-up).
final_reporter = AssistantAgent(
    name="Final_Reporter",
    model_client=model_client,
    system_message="""你是世界顶级的足球战术分析师兼专栏作家。
你的任务是接收一堆从网络上搜集来的、关于一场比赛的原始、非结构化文本资料。

你需要执行以下两个核心步骤来完成任务:
1. **数据提取与分析**: 首先,你必须仔细阅读所有资料,从中提取出结构化的关键信息,特别是【以往战绩】(包括历史交锋和近期战绩)。然后,基于这些提取出的数据进行深度战术分析,并形成你自己的胜率和比分预测。
2. **报告撰写**: 接着,你需要将你的所有分析和提取出的数据,撰写成一篇精彩、完整的赛前分析报告。

你的最终报告**必须**包含一个清晰的【以往战绩】部分,详细列出你找到的比赛记录。
报告的整体结构应包括:标题、核心看点、以往战绩回顾、战术分析、胜率与比分预测、总结。
""",
)

# ================== 工具与辅助函数 ==================

def perform_web_search_tool(queries: list) -> str:
    """Run each query through the Tavily client and merge all results.

    Returns the concatenated raw text of every hit, or "" when the
    search fails (the error is printed, not raised).
    """
    print(f"\n— [Tool: Web Search] 正在执行深度搜索… —")
    raw_content = ""
    try:
        # Request several results per query for broader coverage.
        all_results = []
        for query in queries:
            response = tavily_client.search(query=query, search_depth="advanced", max_results=5)
            all_results.extend(response['results'])
        raw_content = "\n\n—\n\n".join([f"来源: {item.get('url', 'N/A')}\n内容: {item.get('content', '')}" for item in all_results])
        print(f"— [Tool: Web Search] 搜索完成,共找到 {len(all_results)} 条结果 —")
    except Exception as e:
        print(f"— [Tool: Web Search] 搜索出错: {e}")
    return raw_content

def save_report_to_file(report_content: str, teams_input: str) -> None:
    """Save the final report under soccer_automated_reports/ using a
    timestamped, filesystem-safe name derived from the team names."""
    os.makedirs("soccer_automated_reports", exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Keep only alphanumerics from the (truncated) team string; everything
    # else becomes '_' so the name is safe on every filesystem.
    safe_subject = "".join([c if c.isalnum() else "_" for c in teams_input.replace("vs", "")[:30]])
    filename = f"soccer_automated_reports/{safe_subject}_{timestamp}.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(report_content)
    # Bug fix: the original printed a literal placeholder instead of the path.
    print(f"\n— [System] 最终分析报告已保存至: {filename} —")

# ================== 主工作流 (全自动版) ==================

async def process_prediction_request(core_question: str):
    """Full pipeline for one prediction request: have the planner agent
    design the searches, run them, then have the reporter agent analyze
    the results and write (and save) the final report."""
    print(f"\n> 收到预测任务: {core_question}")

    # Stage 1: the planner designs the search queries.
    print("\n— [Research Planner] 正在规划数据搜集策略… —")

    planner_input = TextMessage(content=core_question, source="user")
    response_planner = await research_planner.on_messages([planner_input], None)
    plan_json = response_planner.chat_message.content

    print("— [Research Planner] 搜集计划已生成 —")
    print(plan_json)

    try:
        # Strip an optional ```json fence robustly. The original used
        # lstrip("```json")/rstrip("```"), which strip *character sets*
        # and could eat leading/trailing JSON characters.
        cleaned = plan_json.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.removeprefix("```json").removeprefix("```")
            cleaned = cleaned.strip().removesuffix("```").strip()
        plan = json.loads(cleaned)
        search_queries = plan.get("search_queries", [])
    except json.JSONDecodeError:
        print("错误:研究员未能生成有效的JSON计划,任务中断。")
        return

    if not search_queries:
        print("未能从计划中提取到搜索关键词。")
        return

    # Execute the search tool.
    raw_data_from_web = perform_web_search_tool(search_queries)

    # Stage 2: the reporter analyzes the data and writes the report.
    print("\n— [Final Reporter] 正在分析数据并撰写最终报告… —")

    reporter_prompt = f"""
请基于以下从网络上搜集到的关于 '{core_question}' 的原始资料,
提取关键战绩,进行深度分析,并撰写一份包含【以往战绩】的最终报告。

【原始资料】:

{raw_data_from_web}

"""
    reporter_input = TextMessage(content=reporter_prompt, source="user")
    response_reporter = await final_reporter.on_messages([reporter_input], None)
    final_report = response_reporter.chat_message.content

    print("\n" + "=" * 60 + "\n📋 最终分析报告 📋\n" + "=" * 60)
    print(final_report)

    # Extract the team names for the report filename; fall back to a
    # generic name when the question doesn't match the expected pattern.
    try:
        teams_input = core_question.split("预测")[1].split("的")[0].strip()
    except (IndexError, AttributeError):
        teams_input = "match_prediction"

    save_report_to_file(final_report, teams_input)


async def main():
    """Interactive loop: read a match question and run one prediction per turn."""
    print("=" * 60 + "\n⚽ 足球赛事预测系统启动 ⚽\n" + "=" * 60)

    while True:
        print("\n" + "#" * 60)
        core_question = input("请输入您想预测的比赛 (例如: 预测 徐州队 vs 无锡队 的胜负和比分),或输入'exit'退出:\n> ")
        if core_question.strip().lower() in ['exit', 'quit']:
            print("感谢使用,再见!")
            break
        if not core_question.strip():
            continue

        await process_prediction_request(core_question)

if __name__ == "__main__":
    # Run the interactive loop; Ctrl-C exits gracefully.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n程序被中断。")

如果下载失败,复制下面代码命名requirements.txt与上段代码(soccer_analyst_v3.py)保存在同一文件夹下

aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.10.0
attrs==25.3.0
autogen-agentchat==0.7.2
autogen-core==0.7.2
autogen-ext==0.7.2
beautifulsoup4==4.13.4
certifi==2025.8.3
cffi==1.17.1
charset-normalizer==3.4.2
coloredlogs==15.0.1
cryptography==45.0.6
ddddocr==1.5.6
Deprecated==1.2.18
distro==1.9.0
exceptiongroup==1.3.0
flatbuffers==25.2.10
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
humanfriendly==10.0
idna==3.10
img2pdf==0.6.1
importlib_metadata==8.7.0
jiter==0.10.0
jsonref==1.1.0
lxml==6.0.0
mpmath==1.3.0
numpy==2.2.6
onnxruntime==1.22.1
openai==1.99.3
opencv-python-headless==4.12.0.88
opentelemetry-api==1.36.0
outcome==1.3.0.post0
packaging==25.0
pdfminer.six==20250506
pdfplumber==0.11.7
pikepdf==9.10.2
pillow==11.3.0
playwright==1.54.0
protobuf==5.29.5
pycparser==2.22
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
PyMuPDF==1.26.3
pypdfium2==4.30.0
PySocks==1.7.1
pytesseract==0.3.13
python-dotenv==1.1.1
regex==2025.7.34
requests==2.32.4
selenium==4.34.2
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
sympy==1.14.0
tavily-python==0.7.10
tiktoken==0.10.0
tqdm==4.67.1
trio==0.30.0
trio-websocket==0.12.2
typing-inspection==0.4.1
typing_extensions==4.14.1
undetected-chromedriver==3.5.5
urllib3==2.5.0
webdriver-manager==4.0.2
websocket-client==1.8.0
websockets==15.0.1
wrapt==1.17.2
wsproto==1.2.0
zipp==3.23.0

指令1: (如果需要) 安装兼容的Python版本 (以Python 3.11为例)

如果你的系统已经有python3.11,可以跳过此步

sudo apt update
sudo apt install python3.11 python3.11-venv

指令2: 创建一个名为 .venv 的虚拟环境

python3.11 -m venv .venv

指令3: 激活(进入)这个虚拟环境

source .venv/bin/activate

指令4: (推荐) 升级环境中的pip工具

pip install --upgrade pip

指令5: 安装项目所需的所有依赖库

pip install -r requirements.txt

指令6:运行脚本

python soccer_analyst_v3.py