# Respectful crawling for news sites
# Base delay, in seconds, between consecutive requests to the same site.
DOWNLOAD_DELAY = 2
# This setting is an on/off flag, not a randomization factor, so it is set to
# True rather than the original 0.5.
# NOTE(review): Scrapy's boolean coercion appears to read 0.5 as "disabled";
# confirm against the Scrapy settings reference.
RANDOMIZE_DOWNLOAD_DELAY = True
# Upper bound on simultaneous requests sent to any single domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 2
# Custom validation pipeline
from scrapy.exceptions import DropItem


class ArticleValidationPipeline:
    """Drop scraped articles that are too short or lack required fields.

    Items that pass both checks are returned unchanged so that later
    pipelines can process them.
    """

    # Minimum number of characters the 'content' field must contain.
    MIN_CONTENT_LENGTH = 100
    # Fields that must be present and non-empty on every item.
    REQUIRED_FIELDS = ('title', 'content', 'article_url')

    def process_item(self, item, spider):
        """Validate *item* and return it, or raise DropItem to discard it.

        Args:
            item: The scraped article; accessed through ``item.get(...)``.
            spider: The spider that produced the item (unused here).

        Raises:
            DropItem: If the content is shorter than MIN_CONTENT_LENGTH
                characters or any required field is missing/empty.
        """
        # Ensure minimum content length. `or ''` covers a 'content' key that
        # is present but None, which would otherwise make len() raise.
        if len(item.get('content') or '') < self.MIN_CONTENT_LENGTH:
            raise DropItem(f"Article too short: {item.get('title')}")

        # Ensure required fields
        for field in self.REQUIRED_FIELDS:
            if not item.get(field):
                raise DropItem(f"Missing required field: {field}")

        return item
Running with Parameters
# Scrape different categories
scrapy crawl articles -a category=technology
scrapy crawl articles -a category=business
scrapy crawl articles -a category=sports
# With custom job ID
scrapy crawl articles -a category=technology -s JOB_ID=tech_news_morning
Monitoring Example
Real-time monitoring setup for production use.
Monitoring Spider
# monitoring/spiders/health_check.py
import scrapy
import psycopg2
from datetime import datetime, timedelta
class HealthCheckSpider(scrapy.Spider):
    """Spider that yields a single database health report item.

    It issues one request to an empty ``data:`` URL only to obtain a callback;
    the callback queries the ``job_items`` and ``job_logs`` tables.
    """

    name = 'health_check'

    # Fewer than this many ERROR log rows in the last hour counts as healthy.
    ERROR_WARNING_THRESHOLD = 10

    def start_requests(self):
        """Schedule the single callback that performs the health check."""
        # This spider doesn't make HTTP requests
        # It just checks database health
        yield scrapy.Request('data:,', self.check_database_health)

    def check_database_health(self, response):
        """Check database connectivity and recent activity.

        Yields exactly one dict: the health report on success, or a report
        with ``status == 'critical'`` and the error text if anything fails.
        """
        try:
            health_report = self._build_health_report()
        except Exception as e:
            # Top-level boundary: any failure becomes a 'critical' report
            # instead of crashing the spider.
            self.logger.error(f"Health check failed: {e}")
            health_report = {
                'timestamp': datetime.now().isoformat(),
                'database_connected': False,
                'error': str(e),
                'status': 'critical'
            }
        else:
            self.logger.info(f"Health check completed: {health_report}")

        yield health_report

    def _build_health_report(self):
        """Query the database and return the health report dict.

        The connection is always closed, even when a query raises.
        """
        conn = psycopg2.connect(self.settings.get('DB_URL'))
        try:
            with conn.cursor() as cursor:
                # Check recent activity (last 24 hours)
                cursor.execute("""
                    SELECT
                        job_id,
                        COUNT(*) as item_count,
                        MAX(created_at) as last_activity
                    FROM job_items
                    WHERE created_at > NOW() - INTERVAL '24 hours'
                    GROUP BY job_id
                """)
                recent_jobs = cursor.fetchall()

                # Check for errors in logs
                cursor.execute("""
                    SELECT COUNT(*)
                    FROM job_logs
                    WHERE type = 'ERROR'
                    AND created_at > NOW() - INTERVAL '1 hour'
                """)
                recent_errors = cursor.fetchone()[0]
        finally:
            conn.close()

        if recent_errors < self.ERROR_WARNING_THRESHOLD:
            status = 'healthy'
        else:
            status = 'warning'

        return {
            'timestamp': datetime.now().isoformat(),
            'database_connected': True,
            'recent_jobs': len(recent_jobs),
            # job[1] is the item_count column of each row.
            'total_items_24h': sum(job[1] for job in recent_jobs),
            'recent_errors_1h': recent_errors,
            'status': status
        }
Data Analysis Queries
Common queries for analyzing scraped data:
- Basic Setup Examples
-- Daily scraping summary: items stored per day and job over the last 7 days
SELECT
    DATE(created_at) AS date,
    job_id,
    COUNT(*) AS items_scraped
FROM job_items
WHERE created_at > CURRENT_DATE - INTERVAL '7 days'
GROUP BY date, job_id
ORDER BY date DESC, items_scraped DESC;
-- Performance analysis: per-domain request volume, latency and error count
-- over the last day
SELECT
    -- Third '/'-separated piece of the URL, i.e. the host in scheme://host/...
    SPLIT_PART(url, '/', 3) AS domain,
    AVG(response_time) AS avg_response_time,
    COUNT(*) AS request_count,
    COUNT(CASE WHEN status_code >= 400 THEN 1 END) AS error_count
FROM job_requests
WHERE created_at > CURRENT_DATE - INTERVAL '1 day'
GROUP BY domain
ORDER BY request_count DESC;
-- Content analysis for news articles: article count and average length per
-- category, read from the JSON "item" column
SELECT
    item->>'category' AS category,
    COUNT(*) AS article_count,
    AVG((item->>'word_count')::int) AS avg_word_count,
    AVG((item->>'reading_time_minutes')::int) AS avg_reading_time
FROM job_items
WHERE item->>'category' IS NOT NULL
GROUP BY category
ORDER BY article_count DESC;
Next Steps
Advanced Configurations - More complex setup examples
Production Deployment - Production-ready configurations
Troubleshooting - Common issues and solutions
This section provides complete, working examples for common Scrapy Item Ingest setups. Each example includes the complete spider code, settings configuration, and expected database output.
Simple E-commerce Scraper
A basic spider that scrapes product information and stores it in PostgreSQL.
Project Structure
ecommerce_scraper/
├── scrapy.cfg
├── ecommerce_scraper/
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders/
│ ├── __init__.py
│ └── products.py
└── requirements.txt
Items Definition
# ecommerce_scraper/items.py
import scrapy
from scrapy import Item, Field
class ProductItem(Item):
    """Container for the fields scraped from one product page."""

    # Descriptive fields
    name = Field()
    price = Field()
    description = Field()
    category = Field()
    brand = Field()
    availability = Field()

    # Review information
    rating = Field()
    review_count = Field()

    # Links
    image_urls = Field()
    product_url = Field()
Spider Implementation
# ecommerce_scraper/spiders/products.py
import scrapy
from ecommerce_scraper.items import ProductItem
class ProductsSpider(scrapy.Spider):
    """Follow listing pages of the example store and yield one ProductItem
    per product page."""

    name = 'products'
    allowed_domains = ['example-store.com']
    start_urls = ['https://example-store.com/products']

    def parse(self, response):
        """Parse category pages and extract product links"""
        # Extract product URLs
        for link in response.css('.product-item a::attr(href)').getall():
            yield response.follow(link, self.parse_product)

        # Follow pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_product(self, response):
        """Extract product details"""
        item = ProductItem()
        item['name'] = response.css('h1.product-title::text').get()
        item['price'] = self.extract_price(response.css('.price::text').get())
        item['description'] = response.css('.product-description::text').get()
        item['category'] = response.css('.breadcrumb li:last-child::text').get()
        item['brand'] = response.css('.product-brand::text').get()
        item['availability'] = response.css('.availability::text').get()
        item['rating'] = self.extract_rating(
            response.css('.rating::attr(data-rating)').get()
        )
        item['review_count'] = self.extract_number(
            response.css('.review-count::text').get()
        )
        item['image_urls'] = response.css('.product-images img::attr(src)').getall()
        item['product_url'] = response.url
        yield item

    def extract_price(self, price_text):
        """Clean and extract numeric price.

        Returns the first number found in *price_text* as a float, or None
        when the text is empty/None or contains no digits.
        """
        if not price_text:
            return None
        import re
        # Thousands separators are removed first, so the pattern only needs
        # digits with an optional decimal part.
        match = re.search(r'\d+\.?\d*', price_text.replace(',', ''))
        return float(match.group()) if match else None

    def extract_rating(self, rating_text):
        """Extract numeric rating.

        Returns *rating_text* converted to float, or None when it is
        empty/None or not a valid number.
        """
        if not rating_text:
            return None
        try:
            return float(rating_text)
        except ValueError:
            return None

    def extract_number(self, text):
        """Extract number from text.

        Returns the first run of digits in *text* as an int, or None when
        the text is empty/None or contains no digits.
        """
        if not text:
            return None
        import re
        match = re.search(r'\d+', text)
        return int(match.group()) if match else None
Settings Configuration
# ecommerce_scraper/settings.py
import os
import time  # required by JOB_ID below; the original used time.time() without importing it

from dotenv import load_dotenv

# Load variables from a local .env file into the environment before they are
# read with os.getenv() below.
load_dotenv()

# Scrapy settings
BOT_NAME = 'ecommerce_scraper'
SPIDER_MODULES = ['ecommerce_scraper.spiders']
NEWSPIDER_MODULE = 'ecommerce_scraper.spiders'

# Database configuration
# DATABASE_URL from the environment wins; the literal is only a fallback.
DB_URL = os.getenv('DATABASE_URL', 'postgresql://user:password@localhost:5432/ecommerce')
CREATE_TABLES = True
# Job identifier derived from the current Unix time in whole seconds, evaluated
# when this settings module is loaded.
JOB_ID = f'products_{int(time.time())}'

# Pipeline configuration
ITEM_PIPELINES = {
    'scrapy_item_ingest.DbInsertPipeline': 300,
}

# Extension configuration
EXTENSIONS = {
    'scrapy_item_ingest.LoggingExtension': 500,
}

# Scrapy performance settings
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1
# This setting is an on/off flag, not a randomization factor, so it is set to
# True rather than the original 0.5.
# NOTE(review): Scrapy's boolean coercion appears to read 0.5 as "disabled";
# confirm against the Scrapy settings reference.
RANDOMIZE_DOWNLOAD_DELAY = True

# User agent
USER_AGENT = 'ecommerce_scraper (+http://www.yourdomain.com)'

# Enable AutoThrottle for respectful crawling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
Requirements File
# requirements.txt
scrapy>=2.5.0
scrapy-item-ingest
python-dotenv
psycopg2-binary
Environment Setup
# .env file
DATABASE_URL=postgresql://scrapy_user:secure_password@localhost:5432/ecommerce_db
Running the Spider
# Install dependencies
pip install -r requirements.txt
# Run the spider
scrapy crawl products
# Run with custom settings
scrapy crawl products -s JOB_ID=products_batch_001
Expected Database Output
After running the spider, your database will contain:
job_items table:
{
"id": 1,
"item": {
"name": "Wireless Bluetooth Headphones",
"price": 79.99,
"description": "High-quality wireless headphones with noise cancellation",
"category": "Electronics",
"brand": "TechBrand",
"availability": "In Stock",
"rating": 4.5,
"review_count": 234,
"image_urls": ["https://example.com/image1.jpg", "https://example.com/image2.jpg"],
"product_url": "https://example-store.com/wireless-headphones"
},
"created_at": "2025-07-21T10:30:00.123456Z",
"job_id": 1
}
job_requests table:
| id | url | method | status_code | response_time |
|----|------------------------------------------|--------|-------------|---------------|
| 1 | https://example-store.com/products | GET | 200 | 0.245 |
| 2 | https://example-store.com/headphones | GET | 200 | 0.189 |
job_logs table:
| id | type | message | created_at |
|----|------|-----------------------------------|-------------------------|
| 1 | INFO | Spider opened: products | 2025-07-21 10:30:00 |
| 2 | INFO | Successfully processed item | 2025-07-21 10:30:15 |
News Article Scraper
A more complex example that scrapes news articles with full-text content.
Spider Implementation
# news_scraper/spiders/articles.py
import scrapy
from datetime import datetime
import re
class ArticlesSpider(scrapy.Spider):
    """Crawl one news category and yield a dict per article.

    The category is chosen with the ``category`` spider argument
    (``-a category=...``) and defaults to ``'technology'``.
    """

    name = 'articles'
    allowed_domains = ['news-site.com']

    # Average reading speed used to estimate reading time.
    WORDS_PER_MINUTE = 200

    # Date formats recognised by extract_date(), tried in this order.
    _DATE_PATTERNS = (
        re.compile(r'(\d{4}-\d{2}-\d{2})'),      # YYYY-MM-DD
        re.compile(r'(\d{1,2}/\d{1,2}/\d{4})'),  # MM/DD/YYYY
        re.compile(r'(\w+ \d{1,2}, \d{4})'),     # Month DD, YYYY
    )

    def __init__(self, category='technology', *args, **kwargs):
        """Store the category and derive the start URL from it."""
        super().__init__(*args, **kwargs)
        self.category = category
        self.start_urls = [f'https://news-site.com/{category}']

    def parse(self, response):
        """Parse article listing pages"""
        for article in response.css('.article-preview'):
            article_url = article.css('a::attr(href)').get()
            if article_url:
                yield response.follow(
                    article_url,
                    self.parse_article,
                    meta={'category': self.category}
                )

        # Follow pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_article(self, response):
        """Extract full article content"""
        # Extract article metadata
        title = response.css('h1.article-title::text').get()
        author = response.css('.author-name::text').get()
        publish_date = self.extract_date(response.css('.publish-date::text').get())

        # Extract article content: one text node per line
        content_paragraphs = response.css('.article-content p::text').getall()
        full_content = '\n'.join(content_paragraphs)

        # Extract tags
        tags = response.css('.article-tags .tag::text').getall()

        # Calculate reading time (average 200 words per minute), never
        # reporting less than one minute.
        word_count = len(full_content.split())
        reading_time = max(1, round(word_count / self.WORDS_PER_MINUTE))

        item = {
            'title': title,
            'author': author,
            'publish_date': publish_date,
            'category': response.meta.get('category'),
            'content': full_content,
            'word_count': word_count,
            'reading_time_minutes': reading_time,
            'tags': tags,
            'article_url': response.url,
            'scraped_at': datetime.now().isoformat(),
        }

        self.logger.info(f"Scraped article: {title} ({word_count} words)")
        yield item

    def extract_date(self, date_text):
        """Extract and normalize date.

        Returns the first substring of *date_text* matching one of the known
        date formats, the unmodified text when none match, or None when the
        text is empty/None. The matched substring is returned as found; it is
        not converted to a common format.
        """
        if not date_text:
            return None

        # Handle different date formats
        for pattern in self._DATE_PATTERNS:
            match = pattern.search(date_text)
            if match:
                return match.group(1)

        return date_text
Settings for News Scraper
# news_scraper/settings.py
# Project name for the news scraper.
BOT_NAME = 'news_scraper'

# Database configuration: connection string, table creation flag and the
# fixed job identifier.
DB_URL = 'postgresql://user:password@localhost:5432/news_db'
CREATE_TABLES = True
JOB_ID = 'news_daily_scrape'

# Extensions
EXTENSIONS = {'scrapy_item_ingest.LoggingExtension': 500}

# Pipelines: the validation pipeline has the lower number (200) and the
# database insert pipeline the higher one (300).
ITEM_PIPELINES = {
    'news_scraper.pipelines.ArticleValidationPipeline': 200,
    'scrapy_item_ingest.DbInsertPipeline': 300,
}