Examples & Use Cases

Explore practical examples of using LLMCrawl for various scraping and data extraction tasks.

E-commerce & Product Data

Product Information Extraction

Extract structured product data from e-commerce websites:

typescript
import { LLMCrawl } from "@llmcrawl/llmcrawl-js";

const client = new LLMCrawl({ apiKey: "your-api-key" });

const productSchema = {
  type: "object",
  properties: {
    name: { type: "string" },
    price: { type: "number" },
    originalPrice: { type: "number" },
    discount: { type: "number" },
    rating: { type: "number" },
    reviewCount: { type: "number" },
    inStock: { type: "boolean" },
    description: { type: "string" },
    specifications: {
      type: "object",
      properties: {
        brand: { type: "string" },
        model: { type: "string" },
        color: { type: "string" },
        size: { type: "string" },
      },
    },
    images: {
      type: "array",
      items: { type: "string" },
    },
  },
  required: ["name", "price", "inStock"],
};

const result = await client.scrape("https://store.example.com/product/123", {
  formats: ["markdown"],
  extract: { schema: productSchema },
});

if (result.success && result.data.extract) {
  const product = JSON.parse(result.data.extract);
  console.log(`Product: ${product.name}`);
  console.log(`Price: $${product.price}`);
  console.log(`In Stock: ${product.inStock}`);
}

Price Monitoring

Monitor product prices across multiple websites:

typescript
interface Product {
  name: string;
  price: number;
  url: string;
  timestamp: Date;
}

async function monitorPrices(productUrls: string[]): Promise<Product[]> {
  const priceSchema = {
    type: "object",
    properties: {
      name: { type: "string" },
      price: { type: "number" },
      currency: { type: "string" },
    },
    required: ["name", "price"],
  };

  const products: Product[] = [];

  for (const url of productUrls) {
    try {
      const result = await client.scrape(url, {
        extract: { schema: priceSchema },
      });

      if (result.success && result.data.extract) {
        const data = JSON.parse(result.data.extract);
        products.push({
          name: data.name,
          price: data.price,
          url,
          timestamp: new Date(),
        });
      }
    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error);
    }

    // Rate limiting
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  return products;
}

// Usage
const productUrls = [
  "https://store1.example.com/product/123",
  "https://store2.example.com/item/456",
  "https://store3.example.com/product/789",
];

const prices = await monitorPrices(productUrls);
console.log("Current prices:", prices);

News & Content Aggregation

News Article Extraction

Extract structured data from news articles:

typescript
const articleSchema = {
  type: "object",
  properties: {
    headline: { type: "string" },
    subheadline: { type: "string" },
    author: { type: "string" },
    publishDate: { type: "string" },
    content: { type: "string" },
    tags: { type: "array", items: { type: "string" } },
    category: { type: "string" },
    readTime: { type: "number" },
    relatedArticles: {
      type: "array",
      items: {
        type: "object",
        properties: {
          title: { type: "string" },
          url: { type: "string" },
        },
      },
    },
  },
  required: ["headline", "content"],
};

const newsResult = await client.scrape("https://news.example.com/article/123", {
  formats: ["markdown"],
  extract: { schema: articleSchema },
});

if (newsResult.success && newsResult.data.extract) {
  const article = JSON.parse(newsResult.data.extract);
  console.log(`Article: ${article.headline}`);
  console.log(`Author: ${article.author}`);
  console.log(`Published: ${article.publishDate}`);
}

Blog Content Crawling

Crawl and extract content from blog sites:

typescript
const blogCrawl = await client.crawl("https://blog.example.com", {
  limit: 200,
  includePaths: ["/posts/*", "/articles/*"],
  excludePaths: ["/admin/*", "/author/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: {
      schema: {
        type: "object",
        properties: {
          title: { type: "string" },
          author: { type: "string" },
          publishDate: { type: "string" },
          content: { type: "string" },
          tags: { type: "array", items: { type: "string" } },
          summary: { type: "string" },
        },
      },
    },
  },
});

// Monitor progress
if (blogCrawl.success) {
  let status = await client.getCrawlStatus(blogCrawl.id);

  while (status.success && status.status === "scraping") {
    console.log(`Progress: ${status.completed}/${status.total} articles`);
    await new Promise((resolve) => setTimeout(resolve, 10000));
    status = await client.getCrawlStatus(blogCrawl.id);
  }

  if (status.success && status.status === "completed") {
    console.log(`Successfully crawled ${status.data.length} articles`);

    // Process articles
    const articles = status.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));

    // Create content database
    const contentDB = articles.map((article, index) => ({
      id: index + 1,
      title: article.title,
      author: article.author,
      publishDate: article.publishDate,
      wordCount: article.content.split(" ").length,
      tags: article.tags || [],
    }));

    console.log("Content database created:", contentDB.length, "entries");
  }
}

Documentation & Knowledge Base

API Documentation Scraping

Extract API documentation with code examples:

typescript
const apiDocSchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    description: { type: "string" },
    endpoint: { type: "string" },
    method: { type: "string" },
    parameters: {
      type: "array",
      items: {
        type: "object",
        properties: {
          name: { type: "string" },
          type: { type: "string" },
          required: { type: "boolean" },
          description: { type: "string" },
        },
      },
    },
    responseExample: { type: "string" },
    codeExamples: {
      type: "array",
      items: {
        type: "object",
        properties: {
          language: { type: "string" },
          code: { type: "string" },
        },
      },
    },
  },
};

const docsCrawl = await client.crawl("https://docs.api.example.com", {
  limit: 500,
  includePaths: ["/reference/*", "/endpoints/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: { schema: apiDocSchema },
  },
});
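
The crawl call above only starts the job. As in the blog example, poll until it completes before consuming the results; the sketch below tallies the extracted endpoints by HTTP method:

typescript
if (docsCrawl.success) {
  let status = await client.getCrawlStatus(docsCrawl.id);
  while (status.success && status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 5000));
    status = await client.getCrawlStatus(docsCrawl.id);
  }

  if (status.success && status.status === "completed") {
    const endpoints = status.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));

    // Count endpoints per HTTP method
    const byMethod: Record<string, number> = {};
    for (const endpoint of endpoints) {
      const method = endpoint.method || "UNKNOWN";
      byMethod[method] = (byMethod[method] || 0) + 1;
    }
    console.log("Endpoints by method:", byMethod);
  }
}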

Knowledge Base Creation

Build a searchable knowledge base from documentation:

typescript
interface KnowledgeEntry {
  id: string;
  title: string;
  content: string;
  url: string;
  section: string;
  keywords: string[];
}

async function buildKnowledgeBase(baseUrl: string): Promise<KnowledgeEntry[]> {
  const crawl = await client.crawl(baseUrl, {
    limit: 1000,
    includePaths: ["/docs/*", "/guides/*", "/tutorials/*"],
    scrapeOptions: {
      formats: ["markdown"],
      extract: {
        schema: {
          type: "object",
          properties: {
            title: { type: "string" },
            section: { type: "string" },
            content: { type: "string" },
            keywords: { type: "array", items: { type: "string" } },
          },
        },
      },
    },
  });

  if (!crawl.success) return [];

  // Wait for completion
  let status = await client.getCrawlStatus(crawl.id);
  while (status.success && status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 5000));
    status = await client.getCrawlStatus(crawl.id);
  }

  if (!status.success || status.status !== "completed") return [];

  // Process results
  const knowledgeBase: KnowledgeEntry[] = status.data
    .filter((page) => page.extract && page.markdown)
    .map((page, index) => {
      const extracted = JSON.parse(page.extract);
      return {
        id: `kb_${index + 1}`,
        title: extracted.title || page.metadata?.title || "Untitled",
        content: page.markdown,
        url: page.metadata?.url || "",
        section: extracted.section || "General",
        keywords: extracted.keywords || [],
      };
    });

  return knowledgeBase;
}

// Usage
const kb = await buildKnowledgeBase("https://docs.myapp.com");
console.log(`Knowledge base created with ${kb.length} entries`);
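
Once built, the entries can be queried with a simple keyword match; a real deployment would likely use a search index instead. A minimal sketch:

typescript
function searchKnowledgeBase(
  entries: KnowledgeEntry[],
  query: string
): KnowledgeEntry[] {
  const terms = query.toLowerCase().split(/\s+/);
  return entries.filter((entry) =>
    terms.some(
      (term) =>
        entry.title.toLowerCase().includes(term) ||
        entry.keywords.some((keyword) => keyword.toLowerCase().includes(term)) ||
        entry.content.toLowerCase().includes(term)
    )
  );
}

const hits = searchKnowledgeBase(kb, "authentication");
console.log(`Found ${hits.length} matching entries`);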

Real Estate & Property Data

Property Listing Extraction

Extract property details from real estate websites:

typescript
const propertySchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    price: { type: "number" },
    address: { type: "string" },
    bedrooms: { type: "number" },
    bathrooms: { type: "number" },
    sqft: { type: "number" },
    propertyType: { type: "string" },
    description: { type: "string" },
    features: { type: "array", items: { type: "string" } },
    images: { type: "array", items: { type: "string" } },
    agent: {
      type: "object",
      properties: {
        name: { type: "string" },
        phone: { type: "string" },
        email: { type: "string" },
      },
    },
  },
  required: ["title", "price", "address"],
};

const propertyResult = await client.scrape("https://realty.example.com/listing/123", {
  formats: ["markdown"],
  extract: { schema: propertySchema },
});

Market Analysis

Analyze property market trends:

typescript
async function analyzeMarket(searchUrls: string[]) {
  const properties = [];

  for (const url of searchUrls) {
    const result = await client.scrape(url, {
      extract: { schema: propertySchema },
    });

    if (result.success && result.data.extract) {
      const property = JSON.parse(result.data.extract);
      properties.push(property);
    }

    await new Promise((resolve) => setTimeout(resolve, 2000));
  }

  // Calculate market metrics
  const prices = properties.map((p) => p.price).filter((p) => p > 0);
  const avgPrice = prices.reduce((a, b) => a + b, 0) / prices.length;
  const medianPrice = prices.sort((a, b) => a - b)[
    Math.floor(prices.length / 2)
  ];

  return {
    totalProperties: properties.length,
    averagePrice: avgPrice,
    medianPrice: medianPrice,
    priceRange: {
      min: Math.min(...prices),
      max: Math.max(...prices),
    },
    propertyTypes: [...new Set(properties.map((p) => p.propertyType))],
  };
}
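
For example, pointing the function at a handful of listing pages (the URLs are placeholders):

typescript
const listingUrls = [
  "https://realty.example.com/listing/123",
  "https://realty.example.com/listing/456",
];

const report = await analyzeMarket(listingUrls);
console.log(
  `Analyzed ${report.totalProperties} properties: ` +
    `average $${report.averagePrice.toFixed(0)}, median $${report.medianPrice}`
);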

Job & Career Data

Job Listing Aggregation

Extract job postings from career sites:

typescript
const jobSchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    company: { type: "string" },
    location: { type: "string" },
    salary: { type: "string" },
    type: { type: "string" }, // full-time, part-time, contract
    remote: { type: "boolean" },
    description: { type: "string" },
    requirements: { type: "array", items: { type: "string" } },
    benefits: { type: "array", items: { type: "string" } },
    postedDate: { type: "string" },
    applicationUrl: { type: "string" },
  },
  required: ["title", "company"],
};

// Crawl job boards
const jobsCrawl = await client.crawl("https://jobs.example.com", {
  limit: 1000,
  includePaths: ["/jobs/*", "/careers/*"],
  excludePaths: ["/apply/*", "/profile/*"],
  scrapeOptions: {
    formats: ["markdown"],
    extract: { schema: jobSchema },
  },
});
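
As with the other crawls, poll the job to completion before processing. The sketch below reuses the polling loop from the blog example and then filters for remote roles:

typescript
if (jobsCrawl.success) {
  let status = await client.getCrawlStatus(jobsCrawl.id);
  while (status.success && status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 10000));
    status = await client.getCrawlStatus(jobsCrawl.id);
  }

  if (status.success && status.status === "completed") {
    const jobs = status.data
      .filter((page) => page.extract)
      .map((page) => JSON.parse(page.extract));

    const remoteJobs = jobs.filter((job) => job.remote === true);
    console.log(`${remoteJobs.length} of ${jobs.length} listings are remote`);
  }
}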

Social Media & Reviews

Review Extraction

Extract customer reviews and ratings:

typescript
const reviewSchema = {
  type: "object",
  properties: {
    rating: { type: "number" },
    title: { type: "string" },
    content: { type: "string" },
    author: { type: "string" },
    date: { type: "string" },
    verified: { type: "boolean" },
    helpful: { type: "number" },
    product: { type: "string" },
  },
};

const reviews = await client.scrape("https://reviews.example.com/product/123", {
  extract: { schema: reviewSchema },
});
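
A product page usually carries many reviews, so in practice you may want the extraction to return an array rather than a single object. A variant that wraps the schema above:

typescript
const reviewListSchema = {
  type: "object",
  properties: {
    reviews: { type: "array", items: reviewSchema }, // reuse the schema above
    averageRating: { type: "number" },
    totalReviews: { type: "number" },
  },
};

const allReviews = await client.scrape(
  "https://reviews.example.com/product/123",
  { extract: { schema: reviewListSchema } }
);

if (allReviews.success && allReviews.data.extract) {
  const { reviews } = JSON.parse(allReviews.data.extract);
  console.log(`Extracted ${reviews?.length ?? 0} reviews`);
}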

Financial Data

Stock & Financial Information

Extract financial data from company pages:

typescript
const financialSchema = {
  type: "object",
  properties: {
    symbol: { type: "string" },
    companyName: { type: "string" },
    currentPrice: { type: "number" },
    change: { type: "number" },
    changePercent: { type: "number" },
    volume: { type: "number" },
    marketCap: { type: "string" },
    peRatio: { type: "number" },
    dividendYield: { type: "number" },
    earningsDate: { type: "string" },
  },
};

const stockData = await client.scrape(
  "https://finance.example.com/stock/AAPL",
  {
    extract: { schema: financialSchema },
  }
);
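
As in the other examples, the extracted payload arrives as a JSON string:

typescript
if (stockData.success && stockData.data.extract) {
  const quote = JSON.parse(stockData.data.extract);
  console.log(
    `${quote.symbol}: $${quote.currentPrice} (${quote.changePercent}%)`
  );
}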

Advanced Patterns

Retry Logic with Exponential Backoff

typescript
async function scrapeWithRetry(
  url: string,
  options: any,
  maxRetries = 3
): Promise<any> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const result = await client.scrape(url, options);
      if (result.success) return result;

      // Unsuccessful response: back off before the next attempt
      if (attempt < maxRetries) {
        const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    } catch (error) {
      if (attempt === maxRetries) throw error;
      const delay = Math.pow(2, attempt) * 1000;
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }

  // All attempts returned unsuccessful results without throwing
  throw new Error(`Scrape failed after ${maxRetries} attempts: ${url}`);
}
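
Usage mirrors a plain scrape call (the URL is a placeholder):

typescript
const page = await scrapeWithRetry("https://store.example.com/product/123", {
  formats: ["markdown"],
});
console.log(page.data.markdown);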

Batch Processing with Concurrency Control

typescript
async function batchScrape(urls: string[], concurrency = 5) {
  const results = [];

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.allSettled(
      batch.map((url) => client.scrape(url))
    );

    results.push(...batchResults);

    // Rate limiting between batches
    if (i + concurrency < urls.length) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
  }

  return results;
}
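
Because `Promise.allSettled` never rejects, successes and failures have to be separated after the fact. For example, reusing the `productUrls` list from earlier:

typescript
const settled = await batchScrape(productUrls, 3);

const succeeded = settled.filter((r) => r.status === "fulfilled");
const failed = settled.filter((r) => r.status === "rejected");

console.log(`${succeeded.length} succeeded, ${failed.length} failed`);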

Data Validation & Cleaning

typescript
import Ajv from "ajv";

const ajv = new Ajv();

function validateAndCleanData(data: any, schema: any) {
  const validate = ajv.compile(schema);
  const valid = validate(data);

  if (!valid) {
    console.warn("Validation errors:", validate.errors);
    // Attempt to clean/fix data
    return cleanData(data, validate.errors || []); // errors may be typed as null
  }

  return data;
}

function cleanData(data: any, errors: any[]) {
  // Implement data cleaning logic based on validation errors.
  // This simple pass only coerces top-level string fields to numbers;
  // nested paths (e.g. "/specifications/size") would need path traversal.
  const cleaned = { ...data };

  errors.forEach((error) => {
    if (error.keyword === "type" && error.params.type === "number") {
      const path = error.instancePath.replace("/", "");
      if (typeof cleaned[path] === "string") {
        const num = parseFloat(cleaned[path].replace(/[^0-9.-]/g, ""));
        if (!isNaN(num)) cleaned[path] = num;
      }
    }
  });

  return cleaned;
}
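
Wiring the validator into a scrape result, reusing `productSchema` from the first example:

typescript
const raw = await client.scrape("https://store.example.com/product/123", {
  extract: { schema: productSchema },
});

if (raw.success && raw.data.extract) {
  const product = validateAndCleanData(
    JSON.parse(raw.data.extract),
    productSchema
  );
  console.log("Validated product:", product);
}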

These examples demonstrate the versatility of LLMCrawl across a wide range of data extraction and web scraping use cases. The AI-powered extraction feature makes it straightforward to transform unstructured web content into structured, usable data for your applications.