Crawl4ai (video from June 16, 2025)

Contact via WeChat: adoresever

1. web_scrapping.py :
# ── requirements ─────────────────────────────────────────────────────────
# pip install crawl4ai openai pydantic python-dotenv
# playwright install
from typing import List, Optional
import os, json, asyncio
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMConfig
)

from crawl4ai.extraction_strategy import LLMExtractionStrategy  # LLM extraction strategy




# ── 1. load keys ─────────────────────────────────────────────────────────
load_dotenv() # puts keys in env vars
URL_TO_SCRAPE = "https://en.wikipedia.org/wiki/British_Shorthair"

# ── 2. declare a schema that matches the *instruction* ───────────────────
class BritishShorthairInfo(BaseModel):
    breed_name: str = Field(..., description="The common name of the cat breed, e.g., 'British Shorthair'")
    origin_country: str = Field(..., description="The country of origin for the British Shorthair.")
    history_summary: str = Field(..., description="A concise summary of the history of the British Shorthair.")
    key_characteristics: List[str] = Field(..., description="A list of the main physical characteristics of the British Shorthair, such as body type, head shape, eyes, coat, and tail.")
    temperament: str = Field(..., description="Typical temperament and personality traits of the British Shorthair.")
    common_colors: Optional[List[str]] = Field(None, description="A list of common coat colors for the British Shorthair, e.g., 'blue', 'cream', 'black'.")
    average_weight_kg: Optional[float] = Field(None, description="The average weight of the British Shorthair in kilograms. If a range is provided, calculate the midpoint, otherwise null.")
    lifespan_years: Optional[str] = Field(None, description="The average lifespan of the British Shorthair in years, retain original text format (e.g., '12-15 years').")
    health_issues: Optional[List[str]] = Field(None, description="A list of known common health issues or predispositions for the British Shorthair.")
    care_and_maintenance: Optional[str] = Field(None, description="Requirements for care and daily maintenance of the British Shorthair.")
    recognition: Optional[List[str]] = Field(None, description="A list of cat associations or organizations that recognize the breed.")

INSTRUCTION_TO_LLM = """
You are provided with a Wikipedia page about the British Shorthair cat breed.
Your task is to extract detailed information about this cat.
For each field in the schema, extract the relevant information directly from the text.
- For `history_summary`, provide a concise summary of its main historical development.
- For `key_characteristics`, list the unique physical traits mentioned in the text (e.g., body, head, eyes, fur).
- For list fields (e.g., `common_colors`, `health_issues`, `recognition`), extract all relevant items mentioned.
- For numerical fields like `average_weight_kg` and `lifespan_years`, extract the precise numerical value or range. If weight is given as a range, provide the midpoint if possible, otherwise null. For lifespan, retain the original text format (e.g., "12-15 years").
- If specific information is not explicitly found, set the corresponding field to `null`.
Return **only** valid JSON matching the schema - no additional text or markdown formatting.
"""


# ── 3. DeepSeek is OpenAI-compatible, so pass base_url + model name ──────
llm_cfg = LLMConfig(
    provider="deepseek/deepseek-chat",  # ✅ include the model in the provider string
    api_token=os.getenv("DEEPSEEK_API_KEY"),
    # base_url="https://api.deepseek.com/v1"
)

# ── 4. attach the extraction strategy ────────────────────────────────────
llm_strategy = LLMExtractionStrategy(
    llm_config=llm_cfg,
    schema=BritishShorthairInfo.model_json_schema(),
    extraction_type="schema",
    instruction=INSTRUCTION_TO_LLM,
    chunk_token_threshold=1000,
    apply_chunking=True,
    overlap_rate=0.0,
    input_format="markdown",
)

crawl_cfg = CrawlerRunConfig(
    extraction_strategy=llm_strategy,
    cache_mode=CacheMode.DISABLED,
    remove_overlay_elements=True,
    exclude_external_links=True,
)

browser_cfg = BrowserConfig(headless=True, verbose=True, text_mode=True)

# ── 5. run the crawl ─────────────────────────────────────────────────────
async def main():
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(URL_TO_SCRAPE, config=crawl_cfg)

        if result.success:
            data = json.loads(result.extracted_content)
            print("✅ extracted", len(data), "items")
            for p in data[:10]:
                print(p)
        else:
            print("❌ error:", result.error_message)
        print(llm_strategy.show_usage())  # token cost insight


if __name__ == "__main__":
    asyncio.run(main())
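
Note: with `apply_chunking=True`, `extracted_content` typically comes back as a JSON array holding one object per chunk (the second script below handles this case explicitly). A minimal validation sketch, assuming the schema class above is in scope; the helper name is illustrative and not part of the original script:

import json
from pydantic import ValidationError

def parse_first_record(extracted_content: str):
    # Hypothetical helper: validate the first extracted record against BritishShorthairInfo.
    try:
        records = json.loads(extracted_content)
        if isinstance(records, dict):  # a single object rather than a list of chunk results
            records = [records]
        if not records:
            return None
        return BritishShorthairInfo.model_validate(records[0])
    except (json.JSONDecodeError, ValidationError) as e:
        print("parse failed:", e)
        return None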


2. craw+paper.py :
# ── requirements ─────────────────────────────────────────────────────────
# pip install crawl4ai openai pydantic python-dotenv litellm
# playwright install

import os, json, asyncio
from pydantic import BaseModel, Field
from typing import List, Optional
from dotenv import load_dotenv
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMConfig
)
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Import litellm for direct LLM calls for our paper generation agent
import litellm
import subprocess
import tempfile
# ── 1. Load keys ─────────────────────────────────────────────────────────
load_dotenv()
# Make sure your DEEPSEEK_API_KEY is set in your .env file or environment variables
# For example, in .env: DEEPSEEK_API_KEY="sk-YOUR_DEEPSEEK_KEY"
URL_TO_SCRAPE = "https://en.wikipedia.org/wiki/British_Shorthair"
OUTPUT_MD_FILENAME = "british_shorthair_paper_draft.md"  # Markdown for better formatting
OUTPUT_JSON_FILENAME = "british_shorthair_info.json"
# ── 2. Declare schemas and instructions ──────────────────────────────────

# Schema for structured data extraction from Wikipedia
class BritishShorthairInfo(BaseModel):
    breed_name: str = Field(..., description="The common name of the cat breed, e.g., 'British Shorthair'")
    # The fields below were changed to Optional so that missing data does not fail validation
    origin_country: Optional[str] = Field(None, description="The country of origin for the British Shorthair.")
    history_summary: Optional[str] = Field(None, description="A concise summary of the history of the British Shorthair.")
    key_characteristics: List[str] = Field(..., description="A list of the main physical characteristics of the British Shorthair, such as body type, head shape, eyes, coat, and tail.")
    temperament: Optional[str] = Field(None, description="Typical temperament and personality traits of the British Shorthair.")
    common_colors: Optional[List[str]] = Field(None, description="A list of common coat colors for the British Shorthair, e.g., 'blue', 'cream', 'black'.")
    average_weight_kg: Optional[float] = Field(None, description="The average weight of the British Shorthair in kilograms. If a range is provided, calculate the midpoint, otherwise null.")
    lifespan_years: Optional[str] = Field(None, description="The average lifespan of the British Shorthair in years, retain original text format (e.g., '12-15 years').")
    health_issues: Optional[List[str]] = Field(None, description="A list of known common health issues or predispositions for the British Shorthair.")
    care_and_maintenance: Optional[str] = Field(None, description="Requirements for care and daily maintenance of the British Shorthair.")
    recognition: Optional[List[str]] = Field(None, description="A list of cat associations or organizations that recognize the breed.")
# Instruction for the first agent (Crawl4AI LLMExtractionStrategy)

EXTRACTION_INSTRUCTION_TO_LLM = """
You are provided with a Wikipedia page about the British Shorthair cat breed.
Your task is to extract detailed information about this cat.
For each field in the schema, extract the relevant information directly from the text.
- For `history_summary`, provide a concise summary of its main historical development.
- For `key_characteristics`, list the unique physical traits mentioned in the text (e.g., body, head, eyes, fur).
- For list fields (e.g., `common_colors`, `health_issues`, `recognition`), extract all relevant items mentioned.
- For numerical fields like `average_weight_kg` and `lifespan_years`, extract the precise numerical value or range. If weight is given as a range, provide the midpoint if possible, otherwise null. For lifespan, retain the original text format (e.g., "12-15 years").
- If specific information is not explicitly found, set the corresponding field to `null`.
Return **only** valid JSON matching the schema - no additional text or markdown formatting.
"""

# Instruction for the second agent (Paper Draft Generation Agent)
# This mimics a basic academic paper structure.
# You can deeply customize this instruction to match your exact thesis format!
PAPER_DRAFT_INSTRUCTION = """
You are an expert academic writer. Your task is to compile the provided structured information about the British Shorthair cat into a preliminary research paper draft.
The draft should follow a standard academic structure.

**Instructions for formatting:**
- Use Markdown for headings and formatting.
- Ensure logical flow and coherence between sections.
- For lists within sections (e.g., characteristics, colors, health issues), use bullet points.
- If a section's information is 'null' or very brief, clearly state its absence or keep the section concise.

**Paper Structure:**

# Title: A Comprehensive Overview of the British Shorthair Cat Breed

## 1. Introduction
- Briefly introduce the British Shorthair breed, its origin, and its general popularity.

## 2. History and Development
- Detail the historical background of the breed using the `history_summary`.

## 3. Physical Characteristics
- Describe the key physical traits of the British Shorthair based on `key_characteristics`. Emphasize their distinctive appearance.
- Include information on common coat colors (`common_colors`).

## 4. Temperament and Personality
- Discuss the typical temperament and personality traits (`temperament`).

## 5. Health and Lifespan
- Outline common health issues (`health_issues`).
- Provide information on their average lifespan (`lifespan_years`).

## 6. Care and Maintenance
- Describe the general care and maintenance requirements (`care_and_maintenance`).

## 7. Breed Recognition
- List the associations or organizations that recognize the breed (`recognition`).

## 8. Conclusion
- Summarize the key aspects of the British Shorthair, reinforcing its unique appeal.

## References (Placeholder)
- [Further research needed]

**Provided Structured Data:**
```json
{cat_info_json}
```

Based on the above structured data, generate the paper draft.
"""

# ── 3. Configure LLM for extraction ───────────────────────────────────────
llm_cfg_crawl4ai = LLMConfig(
    provider="deepseek/deepseek-chat",
    api_token=os.getenv("DEEPSEEK_API_KEY"),
    # base_url="https://api.deepseek.com/v1"  # Usually not needed if litellm knows the provider
)
DEEPSEEK_MODEL = "deepseek/deepseek-chat"
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
# Agent 1: Data Acquisition & Structuring Agent (using Crawl4AI)
async def acquire_and_structure_data(url: str) -> Optional[BritishShorthairInfo]:
    print("[AGENT 1: Data Acquisition] Starting web crawling and data extraction...")
    llm_strategy = LLMExtractionStrategy(
        llm_config=llm_cfg_crawl4ai,
        schema=BritishShorthairInfo.model_json_schema(),
        extraction_type="schema",
        instruction=EXTRACTION_INSTRUCTION_TO_LLM,
        chunk_token_threshold=1000,
        apply_chunking=True,
        overlap_rate=0.0,
        input_format="markdown",
    )
    crawl_cfg = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.DISABLED,
        remove_overlay_elements=True,
        exclude_external_links=True,
    )
    browser_cfg = BrowserConfig(headless=True, verbose=True, text_mode=True)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url, config=crawl_cfg)

        if result.success:
            print("✅ [AGENT 1] Data acquisition successful.")
            try:
                extracted_content = result.extracted_content.strip()
                if not extracted_content.startswith('[') and not extracted_content.endswith(']'):
                    extracted_list = [json.loads(extracted_content)]
                else:
                    extracted_list = json.loads(extracted_content)

                if not extracted_list:
                    print("❌ [AGENT 1] Extracted content list is empty.")
                    return None

                extracted_data_dict = extracted_list[0]

                if isinstance(extracted_data_dict, dict) and 'error' in extracted_data_dict:
                    temp_dict = extracted_data_dict.copy()
                    del temp_dict['error']
                    extracted_data_dict = temp_dict

                return BritishShorthairInfo.model_validate(extracted_data_dict)
            except json.JSONDecodeError as e:
                print(f"❌ [AGENT 1] Failed to decode JSON from extracted content: {e}")
                print(f"Content received (first 500 chars): {result.extracted_content[:500]}...")
                return None
            except Exception as e:
                print(f"❌ [AGENT 1] Error validating extracted data: {e}")
                return None
        else:
            print(f"❌ [AGENT 1] Data acquisition failed: {result.error_message}")
            if hasattr(llm_strategy, 'show_usage'):
                print(llm_strategy.show_usage())
            return None

# Agent 2: Paper Draft Generation Agent
async def generate_paper_draft(cat_info: BritishShorthairInfo) -> Optional[str]:
    print("[AGENT 2: Paper Draft Generation] Starting paper draft creation...")
    if not cat_info:
        print("❌ [AGENT 2] No structured data provided for paper generation.")
        return None

    cat_info_json = json.dumps(cat_info.model_dump(), indent=2, ensure_ascii=False)
    final_instruction = PAPER_DRAFT_INSTRUCTION.format(cat_info_json=cat_info_json)

    messages = [
        {"role": "system", "content": "You are an expert academic writer, skilled in compiling factual information into structured research papers."},
        {"role": "user", "content": final_instruction}
    ]

    try:
        response = await litellm.acompletion(
            model=DEEPSEEK_MODEL,
            api_key=DEEPSEEK_API_KEY,
            messages=messages,
            temperature=0.7
        )
        draft_content = response.choices[0].message.content
        print("✅ [AGENT 2] Paper draft generated successfully.")
        return draft_content
    except Exception as e:
        print(f"❌ [AGENT 2] Error generating paper draft: {e}")
        return None

# Agent 3: Local Saving Agent (for Markdown and JSON)
def save_to_local_file(filename: str, content, is_json: bool = False) -> bool:
    # `content` may be a str (Markdown), a dict, or a Pydantic model (when is_json=True)
    print(f"[AGENT 3: Local Saving] Attempting to save content to {filename}...")
    try:
        mode = 'w'
        encoding = 'utf-8'
        if is_json:
            # For JSON, content is already a dict/Pydantic model, dump it directly
            with open(filename, mode, encoding=encoding) as f:
                if isinstance(content, BaseModel):
                    f.write(content.model_dump_json(indent=2, exclude_none=True))  # Save Pydantic model as pretty JSON
                else:
                    json.dump(content, f, indent=2, ensure_ascii=False)  # Fallback for dict
        else:
            with open(filename, mode, encoding=encoding) as f:
                f.write(content)
        print(f"✅ [AGENT 3] Content successfully saved to {filename}")
        return True
    except Exception as e:
        print(f"❌ [AGENT 3] Error saving file: {e}")
        return False

# ── Agent 4: PDF Document Conversion Agent (using TeX Live via Pandoc) ─────────────────────────
def convert_md_to_pdf(input_md_file: str, output_pdf_file: str) -> bool:
    print(f"[AGENT 4: PDF Conversion] Attempting to convert {input_md_file} to {output_pdf_file} using Pandoc and TeX Live...")

    # LaTeX preamble content for headers/footers and Chinese support
    # Ensure your system has the font specified in \setCJKmainfont (e.g., Noto Sans CJK SC)
    latex_header_content = r"""
\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{} % Clear all headers and footers

% Header: right side (e.g., document title)
\rhead{英国短毛猫研究报告}
% Footer: center (page number)
\cfoot{第 \thepage 页}

\renewcommand{\headrulewidth}{0.4pt} % Line under header
\renewcommand{\footrulewidth}{0.4pt} % Line over footer

% For Chinese characters with XeLaTeX
\usepackage{xeCJK}
% Set a common Chinese font. You might need to install 'fonts-noto-cjk' on Linux.
% If 'Noto Sans CJK SC' is not found, try 'Source Han Sans SC', 'WenQuanYi Micro Hei' or 'SimSun'.
\setCJKmainfont{Noto Sans CJK SC}
\XeTeXlinebreaklocale "zh" % Proper line breaking for Chinese
\XeTeXlinebreakskip = 0pt plus 1pt % Adjust line breaking stretch
"""

    temp_header_file = None
    try:
        # Create a temporary file for the LaTeX header content
        temp_header_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.tex', encoding='utf-8')
        temp_header_file.write(latex_header_content)
        temp_header_file.close()

        # Check if xelatex is available in the current PATH
        try:
            subprocess.run(['xelatex', '--version'], check=True, capture_output=True, text=True)
            print(" ℹ️ xelatex found. Pandoc will use TeX Live for PDF generation.")
        except (subprocess.CalledProcessError, FileNotFoundError):
            print(" ⚠️ xelatex not found or not working correctly. Pandoc might fail to create PDF.")
            print(" Please ensure TeX Live 2025 is fully installed and `xelatex` is in your system's PATH.")
            # Continue anyway, as Pandoc might try other engines or the PATH issue might be transient

        # Command: pandoc -s input.md -o output.pdf --pdf-engine=xelatex --include-in-header=temp_header.tex
        # -s or --standalone: produce a standalone document with appropriate headers.
        # --pdf-engine=xelatex: use xelatex for better Chinese support.
        # --include-in-header: inject the custom LaTeX preamble.
        command = [
            'pandoc',
            '-s',
            input_md_file,
            '-o',
            output_pdf_file,
            '--pdf-engine=xelatex',
            f'--include-in-header={temp_header_file.name}'
        ]

        result = subprocess.run(command, capture_output=True, text=True, check=True)

        print("✅ [AGENT 4] Markdown converted to PDF successfully.")
        # print("Pandoc stdout:", result.stdout)  # Uncomment for debug
        # print("Pandoc stderr:", result.stderr)  # Uncomment for debug
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ [AGENT 4] Error converting Markdown to PDF: Pandoc exited with code {e.returncode}")
        print("Pandoc stdout:", e.stdout)
        print("Pandoc stderr:", e.stderr)
        print("This often means Pandoc failed to compile LaTeX. Check Pandoc's stderr for LaTeX errors.")
        print("Ensure TeX Live is fully installed, `xelatex` is in PATH, and all necessary LaTeX packages (like fancyhdr, xeCJK) are installed (usually included in full TeX Live).")
        return False
    except FileNotFoundError as e:
        print(f"❌ [AGENT 4] Error: Command not found ({e}). Please ensure Pandoc and xelatex are installed and accessible in your PATH.")
        return False
    except Exception as e:
        print(f"❌ [AGENT 4] An unexpected error occurred during PDF conversion: {e}")
        return False
    finally:
        # Clean up the temporary header file
        if temp_header_file and os.path.exists(temp_header_file.name):
            os.remove(temp_header_file.name)
            print(f" ℹ️ Cleaned up temporary LaTeX header file: {temp_header_file.name}")

# ── 5. Orchestrate the automated workflow ───────────────────────────────
async def automated_workflow():
    print("🚀 Starting automated research paper draft workflow...")

    # Step 1: Data acquisition & structuring (Agent 1)
    cat_info = await acquire_and_structure_data(URL_TO_SCRAPE)
    if not cat_info:
        print("🛑 Workflow aborted due to data acquisition failure.")
        return

    # Step 2: Save the extracted structured data as JSON
    print(f"[WORKFLOW] Saving extracted data to {OUTPUT_JSON_FILENAME}")
    if not save_to_local_file(OUTPUT_JSON_FILENAME, cat_info, is_json=True):
        print("🛑 Workflow aborted due to JSON saving failure.")
        return
    print(f"🎉 Structured data saved to: {os.path.abspath(OUTPUT_JSON_FILENAME)}")

    # Step 3: Paper draft generation (Agent 2) from the structured data
    paper_draft = await generate_paper_draft(cat_info)  # Use the validated Pydantic object
    if not paper_draft:
        print("🛑 Workflow aborted due to paper draft generation failure.")
        return

    # Step 4: Save the Markdown locally (Agent 3)
    if not save_to_local_file(OUTPUT_MD_FILENAME, paper_draft):
        print("🛑 Workflow completed with Markdown saving failure.")
        return

    # Step 5: PDF document conversion (Agent 4)
    pdf_output_path = OUTPUT_MD_FILENAME.replace(".md", ".pdf")
    if not convert_md_to_pdf(OUTPUT_MD_FILENAME, pdf_output_path):
        print("🛑 Workflow completed with PDF conversion failure.")
        return

    print("\n🎉 Automated workflow completed successfully! Check your files.")
    print(f"Structured data saved to: {os.path.abspath(OUTPUT_JSON_FILENAME)}")
    print(f"Markdown output saved to: {os.path.abspath(OUTPUT_MD_FILENAME)}")
    print(f"PDF output saved to: {os.path.abspath(pdf_output_path)}")


if __name__ == "__main__":
    asyncio.run(automated_workflow())
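
Because the PDF step shells out to Pandoc and xelatex, a small pre-flight check can fail fast before any crawling or LLM calls happen. A minimal sketch using only the standard library; the function name is illustrative and not part of the original script:

import shutil

def check_pdf_prerequisites() -> bool:
    # Pandoc drives the conversion; xelatex compiles the LaTeX preamble (fancyhdr, xeCJK).
    missing = [tool for tool in ("pandoc", "xelatex") if shutil.which(tool) is None]
    if missing:
        print("⚠️ Missing tools for PDF conversion:", ", ".join(missing))
        return False
    return True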
