Source code for scrapy_item_ingest.pipelines.base

"""
Base pipeline functionality for scrapy_item_ingest.
"""
import logging

from ..config.settings import Settings, validate_settings
from ..database.connection import DatabaseConnection
from ..database.schema import SchemaManager

logger = logging.getLogger(__name__)


class BasePipeline:
    """Base pipeline with common functionality.

    Keeps the ingest settings, opens a database connection when the spider
    opens, asks the schema manager to ensure the tables exist, and closes
    the connection when the spider closes.
    """

    def __init__(self, settings):
        """Store *settings* and run ``validate_settings`` on them.

        Args:
            settings: Settings object. This class reads ``db_url`` from it
                and calls ``get_identifier_column()`` and
                ``get_identifier_value(spider)`` on it.
        """
        self.settings = settings
        # Both are populated in open_spider(); None means "not opened yet".
        self.db = None
        self.schema_manager = None
        validate_settings(settings)

    @classmethod
    def from_crawler(cls, crawler):
        """Create pipeline instance from crawler.

        Args:
            crawler: Object exposing a ``settings`` attribute, which is
                wrapped in ``Settings`` before being handed to ``cls``.

        Returns:
            A new instance of ``cls``.
        """
        settings = Settings(crawler.settings)
        return cls(settings)

    def open_spider(self, spider):
        """Called when spider is opened.

        Connects to the database at ``settings.db_url``, builds the schema
        manager and ensures the tables exist.

        Args:
            spider: The spider being opened (not used here).

        Raises:
            ConnectionError: If ``connect()`` returns a falsy value.
            Exception: Anything raised while setting up the schema is
                re-raised after the connection has been closed.
        """
        # Establish database connection
        self.db = DatabaseConnection(self.settings.db_url)
        if not self.db.connect():
            # ConnectionError is a subclass of Exception, so callers that
            # catch the previously raised bare Exception keep working.
            raise ConnectionError("Failed to connect to database")

        try:
            # Initialize schema manager
            self.schema_manager = SchemaManager(self.db, self.settings)
            # Ensure tables exist
            self.schema_manager.ensure_tables_exist()
        except Exception:
            # Schema setup failed after a successful connect: release the
            # connection here instead of leaving it open, and drop the
            # half-initialised state so a later close_spider() is a no-op.
            try:
                self.db.close()
            finally:
                self.db = None
                self.schema_manager = None
            raise

    def close_spider(self, spider):
        """Called when spider is closed.

        Closes the database connection if one was created.

        Args:
            spider: The spider being closed (not used here).
        """
        if self.db:
            self.db.close()

    def get_identifier_info(self, spider):
        """Get identifier column and value for the spider.

        Args:
            spider: Passed through to ``settings.get_identifier_value``.

        Returns:
            A ``(column, value)`` tuple taken from the settings object.
        """
        column = self.settings.get_identifier_column()
        value = self.settings.get_identifier_value(spider)
        return column, value