# Python SDK (Coming Soon)
We're actively working on an official Python SDK for LLMCrawl. It will provide the same powerful features as our JavaScript SDK, exposed through Pythonic APIs.
## Expected Features
- Simple Integration: Easy-to-use Python classes and methods
- Type Hints: Full typing support for better IDE experience
- Async Support: Both synchronous and asynchronous APIs
- AI Extraction: Structured data extraction with Pydantic models
- Comprehensive: Full API coverage including scraping, crawling, and mapping
## Preview API
```python
# Expected API design (subject to change)
from llmcrawl import LLMCrawl

client = LLMCrawl(api_key="your-api-key")

# Scrape a single page
result = await client.scrape("https://example.com")
print(result.data.markdown)

# AI-powered extraction with Pydantic
from pydantic import BaseModel

class Product(BaseModel):
    name: str
    price: float
    in_stock: bool

result = await client.scrape(
    "https://store.example.com/product/123",
    extract_model=Product
)
product = result.data.extract  # Type: Product
```
## Current Alternative: REST API
While we work on the official Python SDK, you can use our REST API directly:
```python
import requests
import json


class LLMCrawlClient:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.llmcrawl.dev/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def scrape(self, url: str, **options):
        data = {"url": url, **options}
        response = requests.post(
            f"{self.base_url}/scrape",
            headers=self.headers,
            json=data
        )
        return response.json()

    def crawl(self, url: str, **options):
        data = {"url": url, **options}
        response = requests.post(
            f"{self.base_url}/crawl",
            headers=self.headers,
            json=data
        )
        return response.json()

    def get_crawl_status(self, job_id: str):
        response = requests.get(
            f"{self.base_url}/crawl/{job_id}",
            headers=self.headers
        )
        return response.json()


# Usage
client = LLMCrawlClient("your-api-key")

# Scrape with AI extraction
result = client.scrape(
    "https://example.com",
    extract={
        "schema": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "content": {"type": "string"}
            }
        }
    }
)

if result["success"]:
    extracted_data = json.loads(result["data"]["extract"])
    print(f"Title: {extracted_data['title']}")
```
## Using with Popular Python Libraries

### With Pydantic for Type Safety
```python
from pydantic import BaseModel
from typing import List, Optional
import json


class Article(BaseModel):
    title: str
    author: str
    content: str
    tags: List[str]
    publish_date: Optional[str] = None


# Define extraction schema
article_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "author": {"type": "string"},
        "content": {"type": "string"},
        "tags": {"type": "array", "items": {"type": "string"}},
        "publish_date": {"type": "string"}
    },
    "required": ["title", "author", "content"]
}

# Scrape and validate
result = client.scrape(
    "https://news.example.com/article",
    extract={"schema": article_schema}
)

if result["success"]:
    # Parse and validate with Pydantic
    extracted_data = json.loads(result["data"]["extract"])
    article = Article(**extracted_data)
    print(f"Article: {article.title} by {article.author}")
```
### With AsyncIO for Concurrent Scraping
```python
import asyncio
import aiohttp
from typing import List


class AsyncLLMCrawlClient:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.llmcrawl.dev/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    async def scrape(self, session: aiohttp.ClientSession, url: str, **options):
        data = {"url": url, **options}
        async with session.post(
            f"{self.base_url}/scrape",
            headers=self.headers,
            json=data
        ) as response:
            return await response.json()

    async def scrape_multiple(self, urls: List[str], **options):
        async with aiohttp.ClientSession() as session:
            tasks = [self.scrape(session, url, **options) for url in urls]
            return await asyncio.gather(*tasks)


# Usage
async def main():
    client = AsyncLLMCrawlClient("your-api-key")
    urls = [
        "https://example1.com",
        "https://example2.com",
        "https://example3.com"
    ]

    results = await client.scrape_multiple(urls, formats=["markdown"])

    for i, result in enumerate(results):
        if result["success"]:
            print(f"URL {i+1}: {len(result['data']['markdown'])} characters")


# Run
asyncio.run(main())
```
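`asyncio.gather` fires every request at once, which can overwhelm a site or trip rate limits on large URL lists. A common pattern is to cap concurrency with an `asyncio.Semaphore`; the helper name and the limit of 5 below are just example choices, built on the `AsyncLLMCrawlClient` defined above:

```python
async def scrape_with_limit(urls, api_key, max_concurrency=5):
    client = AsyncLLMCrawlClient(api_key)
    semaphore = asyncio.Semaphore(max_concurrency)

    async with aiohttp.ClientSession() as session:
        async def bounded_scrape(url):
            # Only `max_concurrency` requests are in flight at any one time
            async with semaphore:
                return await client.scrape(session, url, formats=["markdown"])

        return await asyncio.gather(*(bounded_scrape(url) for url in urls))
```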
### Integration with Data Processing Libraries
```python
import json

import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Reuses the LLMCrawlClient instance (`client`) defined above


def scrape_and_extract(url: str, schema: dict) -> dict:
    """Scrape a URL and extract structured data"""
    result = client.scrape(url, extract={"schema": schema})
    if result["success"]:
        return {
            "url": url,
            "success": True,
            "data": json.loads(result["data"]["extract"])
        }
    return {"url": url, "success": False, "error": result.get("error")}


# Product schema for e-commerce scraping
product_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "price": {"type": "number"},
        "rating": {"type": "number"},
        "reviews_count": {"type": "number"}
    }
}

# URLs to scrape
product_urls = [
    "https://store.example.com/product/1",
    "https://store.example.com/product/2",
    # ... more URLs
]

# Parallel scraping
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {
        executor.submit(scrape_and_extract, url, product_schema): url
        for url in product_urls
    }
    for future in as_completed(future_to_url):
        result = future.result()
        results.append(result)

# Convert to DataFrame for analysis
successful_results = [r for r in results if r["success"]]
df = pd.DataFrame([r["data"] for r in successful_results])
print(df.describe())
```
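If you want to keep track of which page each row came from, or persist the results for later analysis, you can attach the source URL before building the DataFrame and write it out with pandas. A small follow-up sketch (the `products.csv` filename is just an example):

```python
# Keep the source URL alongside the extracted fields
rows = [{"url": r["url"], **r["data"]} for r in successful_results]
df = pd.DataFrame(rows)

# Persist for later analysis
df.to_csv("products.csv", index=False)
```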
## Want to Contribute?
We welcome contributions! If you're interested in helping build the official Python SDK, please:
- Join the Discussion: Reach out to us at [email protected]
- Share Requirements: Tell us what features are most important for your Python use cases
## Stay Updated
- 📧 Email: [email protected]
- 💬 Discord: Join our community
- 🐙 GitHub: Watch our repositories for updates
In the meantime, the REST API examples above provide a solid foundation for using LLMCrawl in your Python applications!