# Examples & Use Cases

Explore practical examples of using LLMCrawl for a range of scraping and data extraction tasks.
## E-commerce & Product Data

### Product Information Extraction

Extract structured product data from e-commerce websites:
```typescript
import { LLMCrawl } from "@llmcrawl/llmcrawl-js";

const client = new LLMCrawl({ apiKey: "your-api-key" });

const productSchema = {
  type: "object",
  properties: {
    name: { type: "string" },
    price: { type: "number" },
    originalPrice: { type: "number" },
    discount: { type: "number" },
    rating: { type: "number" },
    reviewCount: { type: "number" },
    inStock: { type: "boolean" },
    description: { type: "string" },
    specifications: {
      type: "object",
      properties: {
        brand: { type: "string" },
        model: { type: "string" },
        color: { type: "string" },
        size: { type: "string" },
      },
    },
    images: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["name", "price", "inStock"],
};

const result = await client.scrape("https://store.example.com/product/123", {
  formats: ["markdown"],
  extract: { schema: productSchema },
});

if (result.success) {
  const product = JSON.parse(result.data.extract);
  console.log(`Product: ${product.name}`);
  console.log(`Price: $${product.price}`);
  console.log(`In Stock: ${product.inStock}`);
}
```
### Price Monitoring

Monitor product prices across multiple websites:
```typescript
interface Product {
  name: string;
  price: number;
  url: string;
  timestamp: Date;
}

async function monitorPrices(productUrls: string[]): Promise<Product[]> {
  const priceSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      price: { type: "number" },
      currency: { type: "string" },
    },
    required: ["name", "price"],
  };

  const products: Product[] = [];

  for (const url of productUrls) {
    try {
      const result = await client.scrape(url, {
        extract: { schema: priceSchema },
      });

      if (result.success && result.data.extract) {
        const data = JSON.parse(result.data.extract);
        products.push({
          name: data.name,
          price: data.price,
          url,
          timestamp: new Date(),
        });
      }
    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error);
    }

    // Rate limiting
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  return products;
}

// Usage
const productUrls = [
  "https://store1.example.com/product/123",
  "https://store2.example.com/item/456",
  "https://store3.example.com/product/789",
];

const prices = await monitorPrices(productUrls);
console.log("Current prices:", prices);
```
## News & Content Aggregation

### News Article Extraction

Extract structured data from news articles:
```typescript
const articleSchema = {
  type: "object",
  properties: {
    headline: { type: "string" },
    subheadline: { type: "string" },
    author: { type: "string" },
    publishDate: { type: "string" },
    content: { type: "string" },
    tags: { type: "array", items: { type: "string" } },
    category: { type: "string" },
    readTime: { type: "number" },
    relatedArticles: {
      type: "array",
      items: {
        type: "object",
        properties: {
          title: { type: "string" },
          url: { type: "string" },
        },
      },
    },
  },
  required: ["headline", "content"],
};

const newsResult = await client.scrape("https://news.example.com/article/123", {
  formats: ["markdown"],
  extract: { schema: articleSchema },
});

if (newsResult.success) {
  const article = JSON.parse(newsResult.data.extract);
  console.log(`Article: ${article.headline}`);
  console.log(`Author: ${article.author}`);
  console.log(`Published: ${article.publishDate}`);
}
```
### Blog Content Crawling

Crawl and extract content from blog sites:
```typescript
const blogCrawl = await client.crawl("https://blog.example.com", {
  limit: 200,
  includePaths: ["/posts/*", "/articles/*"],
  excludePaths: ["/admin/*", "/author/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: {
      schema: {
        type: "object",
        properties: {
          title: { type: "string" },
          author: { type: "string" },
          publishDate: { type: "string" },
          content: { type: "string" },
          tags: { type: "array", items: { type: "string" } },
          summary: { type: "string" },
        },
      },
    },
  },
});

// Monitor progress
if (blogCrawl.success) {
  let status = await client.getCrawlStatus(blogCrawl.id);

  while (status.success && status.status === "scraping") {
    console.log(`Progress: ${status.completed}/${status.total} articles`);
    await new Promise((resolve) => setTimeout(resolve, 10000));
    status = await client.getCrawlStatus(blogCrawl.id);
  }

  if (status.success && status.status === "completed") {
    console.log(`Successfully crawled ${status.data.length} articles`);

    // Process articles
    const articles = status.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));

    // Create content database
    const contentDB = articles.map((article, index) => ({
      id: index + 1,
      title: article.title,
      author: article.author,
      publishDate: article.publishDate,
      wordCount: article.content.split(" ").length,
      tags: article.tags || [],
    }));

    console.log("Content database created:", contentDB.length, "entries");
  }
}
```
## Documentation & Knowledge Base

### API Documentation Scraping

Extract API documentation with code examples:
```typescript
const apiDocSchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    description: { type: "string" },
    endpoint: { type: "string" },
    method: { type: "string" },
    parameters: {
      type: "array",
      items: {
        type: "object",
        properties: {
          name: { type: "string" },
          type: { type: "string" },
          required: { type: "boolean" },
          description: { type: "string" },
        },
      },
    },
    responseExample: { type: "string" },
    codeExamples: {
      type: "array",
      items: {
        type: "object",
        properties: {
          language: { type: "string" },
          code: { type: "string" },
        },
      },
    },
  },
};

const docsCrawl = await client.crawl("https://docs.api.example.com", {
  limit: 500,
  includePaths: ["/reference/*", "/endpoints/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: { schema: apiDocSchema },
  },
});
```
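Crawl jobs complete asynchronously, so the results need to be collected once the crawl finishes. A minimal polling sketch, mirroring the blog-crawl pattern above and assuming the same status shape:

```typescript
// Poll until the documentation crawl finishes, then collect the
// extracted endpoint entries (same status fields as the blog crawl).
if (docsCrawl.success) {
  let docsStatus = await client.getCrawlStatus(docsCrawl.id);
  while (docsStatus.success && docsStatus.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 5000));
    docsStatus = await client.getCrawlStatus(docsCrawl.id);
  }

  if (docsStatus.success && docsStatus.status === "completed") {
    const endpoints = docsStatus.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));
    console.log(`Extracted ${endpoints.length} endpoint documents`);
  }
}
```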
### Knowledge Base Creation

Build a searchable knowledge base from documentation:
```typescript
interface KnowledgeEntry {
  id: string;
  title: string;
  content: string;
  url: string;
  section: string;
  keywords: string[];
}

async function buildKnowledgeBase(baseUrl: string): Promise<KnowledgeEntry[]> {
  const crawl = await client.crawl(baseUrl, {
    limit: 1000,
    includePaths: ["/docs/*", "/guides/*", "/tutorials/*"],
    scrapeOptions: {
      formats: ["markdown"],
      extract: {
        schema: {
          type: "object",
          properties: {
            title: { type: "string" },
            section: { type: "string" },
            content: { type: "string" },
            keywords: { type: "array", items: { type: "string" } },
          },
        },
      },
    },
  });

  if (!crawl.success) return [];

  // Wait for completion
  let status = await client.getCrawlStatus(crawl.id);
  while (status.success && status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 5000));
    status = await client.getCrawlStatus(crawl.id);
  }

  if (!status.success || status.status !== "completed") return [];

  // Process results
  const knowledgeBase: KnowledgeEntry[] = status.data
    .filter((page) => page.extract && page.markdown)
    .map((page, index) => {
      const extracted = JSON.parse(page.extract);
      return {
        id: `kb_${index + 1}`,
        title: extracted.title || page.metadata?.title || "Untitled",
        content: page.markdown,
        url: page.metadata?.url || "",
        section: extracted.section || "General",
        keywords: extracted.keywords || [],
      };
    });

  return knowledgeBase;
}

// Usage
const kb = await buildKnowledgeBase("https://docs.myapp.com");
console.log(`Knowledge base created with ${kb.length} entries`);
```
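To make the knowledge base searchable, a simple in-memory lookup is often enough to start with. The scoring below is purely illustrative (not part of LLMCrawl): it ranks entries by how many query terms appear in the title, keywords, or content.

```typescript
// Naive in-memory search: score each entry by query-term matches.
function searchKnowledgeBase(entries: KnowledgeEntry[], query: string) {
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

  return entries
    .map((entry) => {
      const haystack = [entry.title, entry.keywords.join(" "), entry.content]
        .join(" ")
        .toLowerCase();
      const score = terms.filter((t) => haystack.includes(t)).length;
      return { entry, score };
    })
    .filter((r) => r.score > 0)
    .sort((a, b) => b.score - a.score)
    .map((r) => r.entry);
}

const hits = searchKnowledgeBase(kb, "authentication tokens");
console.log(hits.map((h) => h.title));
```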
## Real Estate & Property Data

### Property Listing Extraction

Extract property details from real estate websites:
```typescript
const propertySchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    price: { type: "number" },
    address: { type: "string" },
    bedrooms: { type: "number" },
    bathrooms: { type: "number" },
    sqft: { type: "number" },
    propertyType: { type: "string" },
    description: { type: "string" },
    features: { type: "array", items: { type: "string" } },
    images: { type: "array", items: { type: "string" } },
    agent: {
      type: "object",
      properties: {
        name: { type: "string" },
        phone: { type: "string" },
        email: { type: "string" },
      },
    },
  },
  required: ["title", "price", "address"],
};

const property = await client.scrape("https://realty.example.com/listing/123", {
  formats: ["markdown"],
  extract: { schema: propertySchema },
});
```
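As in the product example, the response can be checked and parsed before use. A short sketch, assuming the same result shape as the other scrape calls:

```typescript
if (property.success && property.data.extract) {
  const listing = JSON.parse(property.data.extract);
  console.log(`${listing.title} (${listing.address})`);
  console.log(`$${listing.price}, ${listing.bedrooms} bd / ${listing.bathrooms} ba`);
}
```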
### Market Analysis

Aggregate scraped listings to compute basic market metrics:
```typescript
async function analyzeMarket(searchUrls: string[]) {
  const properties = [];

  for (const url of searchUrls) {
    const result = await client.scrape(url, {
      extract: { schema: propertySchema },
    });

    if (result.success && result.data.extract) {
      const property = JSON.parse(result.data.extract);
      properties.push(property);
    }

    await new Promise((resolve) => setTimeout(resolve, 2000));
  }

  // Calculate market metrics
  const prices = properties.map((p) => p.price).filter((p) => p > 0);
  const avgPrice = prices.reduce((a, b) => a + b, 0) / prices.length;
  const medianPrice = prices.sort((a, b) => a - b)[
    Math.floor(prices.length / 2)
  ];

  return {
    totalProperties: properties.length,
    averagePrice: avgPrice,
    medianPrice: medianPrice,
    priceRange: {
      min: Math.min(...prices),
      max: Math.max(...prices),
    },
    propertyTypes: [...new Set(properties.map((p) => p.propertyType))],
  };
}
```
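A hypothetical invocation (the listing URLs are placeholders):

```typescript
// Usage: compute metrics over a handful of listings
const marketStats = await analyzeMarket([
  "https://realty.example.com/listing/123",
  "https://realty.example.com/listing/456",
  "https://realty.example.com/listing/789",
]);

console.log(`Average price: $${marketStats.averagePrice.toFixed(0)}`);
console.log(`Median price: $${marketStats.medianPrice}`);
console.log(`Property types: ${marketStats.propertyTypes.join(", ")}`);
```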
## Job & Career Data

### Job Listing Aggregation

Extract job postings from career sites:
```typescript
const jobSchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    company: { type: "string" },
    location: { type: "string" },
    salary: { type: "string" },
    type: { type: "string" }, // full-time, part-time, contract
    remote: { type: "boolean" },
    description: { type: "string" },
    requirements: { type: "array", items: { type: "string" } },
    benefits: { type: "array", items: { type: "string" } },
    postedDate: { type: "string" },
    applicationUrl: { type: "string" },
  },
  required: ["title", "company"],
};

// Crawl job boards
const jobsCrawl = await client.crawl("https://jobs.example.com", {
  limit: 1000,
  includePaths: ["/jobs/*", "/careers/*"],
  excludePaths: ["/apply/*", "/profile/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: { schema: jobSchema },
  },
});
```
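Like the other crawls, this one runs asynchronously. A sketch of polling for completion and grouping the extracted postings by location, assuming the same status shape as the examples above:

```typescript
// Wait for the job crawl to finish, then group postings by location.
if (jobsCrawl.success) {
  let jobsStatus = await client.getCrawlStatus(jobsCrawl.id);
  while (jobsStatus.success && jobsStatus.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 10000));
    jobsStatus = await client.getCrawlStatus(jobsCrawl.id);
  }

  if (jobsStatus.success && jobsStatus.status === "completed") {
    const jobs = jobsStatus.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));

    const byLocation = new Map<string, number>();
    for (const job of jobs) {
      const loc = job.location || "Unspecified";
      byLocation.set(loc, (byLocation.get(loc) || 0) + 1);
    }
    console.log("Postings by location:", Object.fromEntries(byLocation));
  }
}
```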
## Social Media & Reviews

### Review Extraction

Extract customer reviews and ratings:
```typescript
const reviewSchema = {
  type: "object",
  properties: {
    rating: { type: "number" },
    title: { type: "string" },
    content: { type: "string" },
    author: { type: "string" },
    date: { type: "string" },
    verified: { type: "boolean" },
    helpful: { type: "number" },
    product: { type: "string" },
  },
};

const reviews = await client.scrape("https://reviews.example.com/product/123", {
  extract: { schema: reviewSchema },
});
```
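Note that this schema describes a single review; to capture a full review list, the same object could be nested under an array-typed property. A short handling sketch, assuming the same result shape as the other scrape examples:

```typescript
if (reviews.success && reviews.data.extract) {
  const review = JSON.parse(reviews.data.extract);
  console.log(`${review.rating}/5 - ${review.title || "(untitled)"}`);
  console.log(`by ${review.author}${review.verified ? " (verified)" : ""}`);
}
```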
## Financial Data

### Stock & Financial Information

Extract financial data from company pages:
```typescript
const financialSchema = {
  type: "object",
  properties: {
    symbol: { type: "string" },
    companyName: { type: "string" },
    currentPrice: { type: "number" },
    change: { type: "number" },
    changePercent: { type: "number" },
    volume: { type: "number" },
    marketCap: { type: "string" },
    peRatio: { type: "number" },
    dividendYield: { type: "number" },
    earningsDate: { type: "string" },
  },
};

const stockData = await client.scrape(
  "https://finance.example.com/stock/AAPL",
  {
    extract: { schema: financialSchema },
  }
);
```
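A short usage sketch, again assuming the same response shape as the earlier scrape calls:

```typescript
if (stockData.success && stockData.data.extract) {
  const quote = JSON.parse(stockData.data.extract);
  const sign = quote.change >= 0 ? "+" : "";
  console.log(
    `${quote.symbol}: $${quote.currentPrice} (${sign}${quote.changePercent}%)`
  );
}
```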
## Advanced Patterns

### Retry Logic with Exponential Backoff
```typescript
async function scrapeWithRetry(
  url: string,
  options: any,
  maxRetries = 3
): Promise<any> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const result = await client.scrape(url, options);
      if (result.success) return result;

      if (attempt < maxRetries) {
        const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    } catch (error) {
      if (attempt === maxRetries) throw error;
      const delay = Math.pow(2, attempt) * 1000;
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }

  // Every attempt returned an unsuccessful result without throwing
  throw new Error(`Failed to scrape ${url} after ${maxRetries} attempts`);
}
```
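A usage sketch combining the wrapper with the product schema from the first example:

```typescript
// Usage: give transient failures up to three attempts
const retried = await scrapeWithRetry("https://store.example.com/product/123", {
  formats: ["markdown"],
  extract: { schema: productSchema },
});

if (retried.success) {
  console.log("Scrape succeeded after retries");
}
```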
### Batch Processing with Concurrency Control
```typescript
async function batchScrape(urls: string[], concurrency = 5) {
  const results = [];

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.allSettled(
      batch.map((url) => client.scrape(url))
    );
    results.push(...batchResults);

    // Rate limiting between batches
    if (i + concurrency < urls.length) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
  }

  return results;
}
```
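Because `Promise.allSettled` yields one settlement record per URL, a caller needs to separate fulfilled from rejected results. A usage sketch, reusing the `productUrls` list from the price-monitoring example:

```typescript
// Usage: scrape many URLs with at most 5 requests in flight at once
const settled = await batchScrape(productUrls, 5);

const succeeded = settled.filter(
  (r): r is PromiseFulfilledResult<any> =>
    r.status === "fulfilled" && r.value.success
);
console.log(`${succeeded.length} succeeded, ${settled.length - succeeded.length} failed`);
```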
### Data Validation & Cleaning
```typescript
import Ajv from "ajv";

const ajv = new Ajv();

function validateAndCleanData(data: any, schema: any) {
  const validate = ajv.compile(schema);
  const valid = validate(data);

  if (!valid) {
    console.warn("Validation errors:", validate.errors);
    // Attempt to clean/fix data
    return cleanData(data, validate.errors);
  }

  return data;
}

function cleanData(data: any, errors: any[]) {
  // Implement data cleaning logic based on validation errors
  const cleaned = { ...data };

  errors.forEach((error) => {
    // Coerce numeric strings (e.g. "$1,299.99") into numbers
    if (error.keyword === "type" && error.params.type === "number") {
      const path = error.instancePath.replace("/", "");
      if (typeof cleaned[path] === "string") {
        const num = parseFloat(cleaned[path].replace(/[^0-9.-]/g, ""));
        if (!isNaN(num)) cleaned[path] = num;
      }
    }
  });

  return cleaned;
}
```
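Combining validation with a scrape ties the pieces together. A sketch using the product schema from the first example:

```typescript
// Usage: validate (and coerce) extracted product data before storing it
const raw = await client.scrape("https://store.example.com/product/123", {
  extract: { schema: productSchema },
});

if (raw.success && raw.data.extract) {
  const product = validateAndCleanData(JSON.parse(raw.data.extract), productSchema);
  console.log("Validated product:", product.name, product.price);
}
```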
These examples demonstrate the versatility of LLMCrawl across data extraction and web scraping tasks. Its AI-powered extraction makes it straightforward to turn unstructured web content into structured, usable data for your applications.