# Respectful crawling for news sites DOWNLOAD_DELAY = 2 RANDOMIZE_DOWNLOAD_DELAY = 0.5 CONCURRENT_REQUESTS_PER_DOMAIN = 2 # Custom validation pipeline class ArticleValidationPipeline: def process_item(self, item, spider): # Ensure minimum content length if len(item.get('content', '')) < 100: raise DropItem(f"Article too short: {item.get('title')}") # Ensure required fields required_fields = ['title', 'content', 'article_url'] for field in required_fields: if not item.get(field): raise DropItem(f"Missing required field: {field}") return item Running with Parameters ~~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash # Scrape different categories scrapy crawl articles -a category=technology scrapy crawl articles -a category=business scrapy crawl articles -a category=sports # With custom job ID scrapy crawl articles -a category=technology -s JOB_ID=tech_news_morning Monitoring Example ----------------- Real-time monitoring setup for production use. Monitoring Spider ~~~~~~~~~~~~~~~ .. code-block:: python # monitoring/spiders/health_check.py import scrapy import psycopg2 from datetime import datetime, timedelta class HealthCheckSpider(scrapy.Spider): name = 'health_check' def start_requests(self): # This spider doesn't make HTTP requests # It just checks database health yield scrapy.Request('data:,', self.check_database_health) def check_database_health(self, response): """Check database connectivity and recent activity""" try: conn = psycopg2.connect(self.settings.get('DB_URL')) cursor = conn.cursor() # Check recent activity (last 24 hours) cursor.execute(""" SELECT job_id, COUNT(*) as item_count, MAX(created_at) as last_activity FROM job_items WHERE created_at > NOW() - INTERVAL '24 hours' GROUP BY job_id """) recent_jobs = cursor.fetchall() # Check for errors in logs cursor.execute(""" SELECT COUNT(*) FROM job_logs WHERE type = 'ERROR' AND created_at > NOW() - INTERVAL '1 hour' """) recent_errors = cursor.fetchone()[0] health_report = { 'timestamp': datetime.now().isoformat(), 'database_connected': True, 'recent_jobs': len(recent_jobs), 'total_items_24h': sum(job[1] for job in recent_jobs), 'recent_errors_1h': recent_errors, 'status': 'healthy' if recent_errors < 10 else 'warning' } self.logger.info(f"Health check completed: {health_report}") yield health_report except Exception as e: self.logger.error(f"Health check failed: {e}") yield { 'timestamp': datetime.now().isoformat(), 'database_connected': False, 'error': str(e), 'status': 'critical' } Data Analysis Queries ~~~~~~~~~~~~~~~~~~~ Common queries for analyzing scraped data: .. code-block:: sql Basic Setup Examples -- Daily scraping summary SELECT DATE(created_at) as date, job_id, COUNT(*) as items_scraped FROM job_items WHERE created_at > CURRENT_DATE - INTERVAL '7 days' GROUP BY date, job_id ORDER BY date DESC, items_scraped DESC; -- Performance analysis SELECT SPLIT_PART(url, '/', 3) as domain, AVG(response_time) as avg_response_time, COUNT(*) as request_count, COUNT(CASE WHEN status_code >= 400 THEN 1 END) as error_count FROM job_requests WHERE created_at > CURRENT_DATE - INTERVAL '1 day' GROUP BY domain ORDER BY request_count DESC; -- Content analysis for news articles SELECT item->>'category' as category, COUNT(*) as article_count, AVG((item->>'word_count')::int) as avg_word_count, AVG((item->>'reading_time_minutes')::int) as avg_reading_time FROM job_items WHERE item->>'category' IS NOT NULL GROUP BY category ORDER BY article_count DESC; Next Steps ---------- * :doc:`advanced-configurations` - More complex setup examples * :doc:`production-deployment` - Production-ready configurations * :doc:`troubleshooting` - Common issues and solutions =================== This section provides complete, working examples for common Scrapy Item Ingest setups. Each example includes the complete spider code, settings configuration, and expected database output. Simple E-commerce Scraper ------------------------- A basic spider that scrapes product information and stores it in PostgreSQL. Project Structure ~~~~~~~~~~~~~~~~ .. code-block:: text ecommerce_scraper/ ├── scrapy.cfg ├── ecommerce_scraper/ │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders/ │ ├── __init__.py │ └── products.py └── requirements.txt Items Definition ~~~~~~~~~~~~~~~ .. code-block:: python # ecommerce_scraper/items.py import scrapy from scrapy import Item, Field class ProductItem(Item): name = Field() price = Field() description = Field() category = Field() brand = Field() availability = Field() rating = Field() review_count = Field() image_urls = Field() product_url = Field() Spider Implementation ~~~~~~~~~~~~~~~~~~~~ .. code-block:: python # ecommerce_scraper/spiders/products.py import scrapy from ecommerce_scraper.items import ProductItem class ProductsSpider(scrapy.Spider): name = 'products' allowed_domains = ['example-store.com'] start_urls = ['https://example-store.com/products'] def parse(self, response): """Parse category pages and extract product links""" # Extract product URLs product_links = response.css('.product-item a::attr(href)').getall() for link in product_links: yield response.follow(link, self.parse_product) # Follow pagination next_page = response.css('.pagination .next::attr(href)').get() if next_page: yield response.follow(next_page, self.parse) def parse_product(self, response): """Extract product details""" item = ProductItem() item['name'] = response.css('h1.product-title::text').get() item['price'] = self.extract_price(response.css('.price::text').get()) item['description'] = response.css('.product-description::text').get() item['category'] = response.css('.breadcrumb li:last-child::text').get() item['brand'] = response.css('.product-brand::text').get() item['availability'] = response.css('.availability::text').get() item['rating'] = self.extract_rating(response.css('.rating::attr(data-rating)').get()) item['review_count'] = self.extract_number(response.css('.review-count::text').get()) item['image_urls'] = response.css('.product-images img::attr(src)').getall() item['product_url'] = response.url yield item def extract_price(self, price_text): """Clean and extract numeric price""" if not price_text: return None import re match = re.search(r'[\d,]+\.?\d*', price_text.replace(',', '')) return float(match.group()) if match else None def extract_rating(self, rating_text): """Extract numeric rating""" if not rating_text: return None try: return float(rating_text) except ValueError: return None def extract_number(self, text): """Extract number from text""" if not text: return None import re match = re.search(r'\d+', text) return int(match.group()) if match else None Settings Configuration ~~~~~~~~~~~~~~~~~~~~ .. code-block:: python # ecommerce_scraper/settings.py import os from dotenv import load_dotenv load_dotenv() # Scrapy settings BOT_NAME = 'ecommerce_scraper' SPIDER_MODULES = ['ecommerce_scraper.spiders'] NEWSPIDER_MODULE = 'ecommerce_scraper.spiders' # Database configuration DB_URL = os.getenv('DATABASE_URL', 'postgresql://user:password@localhost:5432/ecommerce') CREATE_TABLES = True JOB_ID = f'products_{int(time.time())}' # Pipeline configuration ITEM_PIPELINES = { 'scrapy_item_ingest.DbInsertPipeline': 300, } # Extension configuration EXTENSIONS = { 'scrapy_item_ingest.LoggingExtension': 500, } # Scrapy performance settings ROBOTSTXT_OBEY = True CONCURRENT_REQUESTS = 16 DOWNLOAD_DELAY = 1 RANDOMIZE_DOWNLOAD_DELAY = 0.5 # User agent USER_AGENT = 'ecommerce_scraper (+http://www.yourdomain.com)' # Enable AutoThrottle for respectful crawling AUTOTHROTTLE_ENABLED = True AUTOTHROTTLE_START_DELAY = 1 AUTOTHROTTLE_MAX_DELAY = 10 AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 Requirements File ~~~~~~~~~~~~~~~ .. code-block:: text # requirements.txt scrapy>=2.5.0 scrapy-item-ingest python-dotenv psycopg2-binary Environment Setup ~~~~~~~~~~~~~~~ .. code-block:: bash # .env file DATABASE_URL=postgresql://scrapy_user:secure_password@localhost:5432/ecommerce_db Running the Spider ~~~~~~~~~~~~~~~~~ .. code-block:: bash # Install dependencies pip install -r requirements.txt # Run the spider scrapy crawl products # Run with custom settings scrapy crawl products -s JOB_ID=products_batch_001 Expected Database Output ~~~~~~~~~~~~~~~~~~~~~~ After running the spider, your database will contain: **job_items table:** .. code-block:: json { "id": 1, "item": { "name": "Wireless Bluetooth Headphones", "price": 79.99, "description": "High-quality wireless headphones with noise cancellation", "category": "Electronics", "brand": "TechBrand", "availability": "In Stock", "rating": 4.5, "review_count": 234, "image_urls": ["https://example.com/image1.jpg", "https://example.com/image2.jpg"], "product_url": "https://example-store.com/wireless-headphones" }, "created_at": "2025-07-21T10:30:00.123456Z", "job_id": 1 } **job_requests table:** .. code-block:: text | id | url | method | status_code | response_time | |----|------------------------------------------|--------|-------------|---------------| | 1 | https://example-store.com/products | GET | 200 | 0.245 | | 2 | https://example-store.com/headphones | GET | 200 | 0.189 | **job_logs table:** .. code-block:: text | id | type | message | created_at | |----|------|-----------------------------------|-------------------------| | 1 | INFO | Spider opened: products | 2025-07-21 10:30:00 | | 2 | INFO | Successfully processed item | 2025-07-21 10:30:15 | News Article Scraper ------------------- A more complex example that scrapes news articles with full-text content. Spider Implementation ~~~~~~~~~~~~~~~~~~~ .. code-block:: python # news_scraper/spiders/articles.py import scrapy from datetime import datetime import re class ArticlesSpider(scrapy.Spider): name = 'articles' allowed_domains = ['news-site.com'] def __init__(self, category='technology', *args, **kwargs): super().__init__(*args, **kwargs) self.category = category self.start_urls = [f'https://news-site.com/{category}'] def parse(self, response): """Parse article listing pages""" articles = response.css('.article-preview') for article in articles: article_url = article.css('a::attr(href)').get() if article_url: yield response.follow( article_url, self.parse_article, meta={'category': self.category} ) # Follow pagination next_page = response.css('.pagination .next::attr(href)').get() if next_page: yield response.follow(next_page, self.parse) def parse_article(self, response): """Extract full article content""" # Extract article metadata title = response.css('h1.article-title::text').get() author = response.css('.author-name::text').get() publish_date = self.extract_date(response.css('.publish-date::text').get()) # Extract article content content_paragraphs = response.css('.article-content p::text').getall() full_content = '\n'.join(content_paragraphs) # Extract tags tags = response.css('.article-tags .tag::text').getall() # Calculate reading time (average 200 words per minute) word_count = len(full_content.split()) reading_time = max(1, round(word_count / 200)) item = { 'title': title, 'author': author, 'publish_date': publish_date, 'category': response.meta.get('category'), 'content': full_content, 'word_count': word_count, 'reading_time_minutes': reading_time, 'tags': tags, 'article_url': response.url, 'scraped_at': datetime.now().isoformat(), } self.logger.info(f"Scraped article: {title} ({word_count} words)") yield item def extract_date(self, date_text): """Extract and normalize date""" if not date_text: return None # Handle different date formats patterns = [ r'(\d{4}-\d{2}-\d{2})', # YYYY-MM-DD r'(\d{1,2}/\d{1,2}/\d{4})', # MM/DD/YYYY r'(\w+ \d{1,2}, \d{4})', # Month DD, YYYY ] for pattern in patterns: match = re.search(pattern, date_text) if match: return match.group(1) return date_text Settings for News Scraper ~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python # news_scraper/settings.py BOT_NAME = 'news_scraper' # Database configuration DB_URL = 'postgresql://user:password@localhost:5432/news_db' CREATE_TABLES = True JOB_ID = 'news_daily_scrape' # Pipelines ITEM_PIPELINES = { 'news_scraper.pipelines.ArticleValidationPipeline': 200, 'scrapy_item_ingest.DbInsertPipeline': 300, } EXTENSIONS = { 'scrapy_item_ingest.LoggingExtension': 500, }