"""SQLAlchemy declarative models built on ``app.db.Base``.

All timestamp columns are timezone-aware (``DateTime(timezone=True)``) and
take their Python-side default from :func:`utcnow`.
"""

from datetime import datetime, timezone

from sqlalchemy import (
    DateTime,
    ForeignKey,
    Index,
    Integer,
    LargeBinary,
    String,
    Text,
    UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy.types import JSON

from app.db import Base


def utcnow() -> datetime:
    """Return the current time as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)


# Generic JSON column type that is rendered as JSONB on the "postgresql" dialect.
json_type = JSON().with_variant(JSONB, "postgresql")


class Employee(Base):
    """ORM model for the ``employees`` table.

    ``profile_key`` is unique across the table; ``full_name`` and ``status``
    are indexed.
    """

    __tablename__ = "employees"
    __table_args__ = (
        UniqueConstraint("profile_key", name="uq_employees_profile_key"),
        Index("ix_employees_full_name", "full_name"),
        Index("ix_employees_status", "status"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_type: Mapped[str | None] = mapped_column(String(50))
    profile_id: Mapped[str | None] = mapped_column(String(255))
    canonical_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    # New rows start as "active" unless a status is given explicitly.
    status: Mapped[str] = mapped_column(
        String(32),
        default="active",
        nullable=False,
    )
    first_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )
    last_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )
    dismissed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    parser_version: Mapped[str | None] = mapped_column(String(32))
    current_data: Mapped[dict | None] = mapped_column(json_type)
    current_checksum: Mapped[str | None] = mapped_column(String(64))
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )
    # Refreshed by the ORM on every UPDATE via ``onupdate``.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        onupdate=utcnow,
        nullable=False,
    )

    # No cascade is configured for snapshots or crawl_run_changes, unlike
    # tabs and publications below, which use "all, delete-orphan".
    snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(
        back_populates="employee",
    )
    tabs: Mapped[list["ProfileTab"]] = relationship(
        back_populates="employee",
        cascade="all, delete-orphan",
    )
    publications: Mapped[list["EmployeePublication"]] = relationship(
        back_populates="employee",
        cascade="all, delete-orphan",
    )
    crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(
        back_populates="employee",
    )


class EmployeeSnapshot(Base):
    """ORM model for the ``employee_snapshots`` table.

    Each row belongs to one employee and may optionally reference a crawl run.
    """

    __tablename__ = "employee_snapshots"
    __table_args__ = (
        Index("ix_employee_snapshots_employee_id", "employee_id"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(
        ForeignKey("employees.id"),
        nullable=False,
    )
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
    parsed_data: Mapped[dict] = mapped_column(json_type, nullable=False)
    html_snapshot: Mapped[bytes | None] = mapped_column(LargeBinary)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    captured_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )

    employee: Mapped[Employee] = relationship(back_populates="snapshots")


class EmployeePublication(Base):
    """ORM model for the ``employee_publications`` table.

    A row is unique per ``(employee_id, publication_id)`` and per
    ``(employee_id, source_hash)``.
    """

    __tablename__ = "employee_publications"
    __table_args__ = (
        UniqueConstraint(
            "employee_id",
            "publication_id",
            name="uq_employee_publications_employee_publication",
        ),
        UniqueConstraint(
            "employee_id",
            "source_hash",
            name="uq_employee_publications_employee_source_hash",
        ),
        Index("ix_employee_publications_employee_id", "employee_id"),
        Index("ix_employee_publications_publication_id", "publication_id"),
        Index("ix_employee_publications_doi_url", "doi_url"),
        Index("ix_employee_publications_year", "year"),
        Index("ix_employee_publications_publication_type", "publication_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # The only foreign key in this module declared with ON DELETE CASCADE.
    employee_id: Mapped[int] = mapped_column(
        ForeignKey("employees.id", ondelete="CASCADE"),
        nullable=False,
    )
    publication_id: Mapped[str | None] = mapped_column(String(64))
    title: Mapped[str] = mapped_column(Text, nullable=False)
    year: Mapped[int | None] = mapped_column(Integer)
    publication_type: Mapped[str | None] = mapped_column(String(64))
    language: Mapped[str | None] = mapped_column(String(16))
    # Stored as an integer here, unlike the string ``status`` on other models.
    status: Mapped[int | None] = mapped_column(Integer)
    url: Mapped[str | None] = mapped_column(Text)
    doi_url: Mapped[str | None] = mapped_column(Text)
    other_url: Mapped[str | None] = mapped_column(Text)
    document_url: Mapped[str | None] = mapped_column(Text)
    citation_text: Mapped[str | None] = mapped_column(Text)
    annotation: Mapped[dict | None] = mapped_column(json_type)
    description: Mapped[dict | None] = mapped_column(json_type)
    authors: Mapped[list | None] = mapped_column(json_type)
    raw_data: Mapped[dict | None] = mapped_column(json_type)
    source_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )
    # Refreshed by the ORM on every UPDATE via ``onupdate``.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        onupdate=utcnow,
        nullable=False,
    )

    employee: Mapped[Employee] = relationship(back_populates="publications")


class CrawlRun(Base):
    """ORM model for the ``crawl_runs`` table.

    Holds a source URL, a status, start/finish timestamps and a set of
    integer counters that all default to zero.
    """

    __tablename__ = "crawl_runs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    # New rows start as "running" unless a status is given explicitly.
    status: Mapped[str] = mapped_column(
        String(32),
        default="running",
        nullable=False,
    )
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    skipped_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(
        Integer,
        default=0,
        nullable=False,
    )
    message: Mapped[str | None] = mapped_column(Text)

    employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(
        back_populates="crawl_run",
    )
    dataset_versions: Mapped[list["DatasetVersion"]] = relationship(
        back_populates="crawl_run",
    )


class CrawlRunEmployeeChange(Base):
    """ORM model for the ``crawl_run_employee_changes`` table.

    Each row references a crawl run and, optionally, an employee. It also
    carries its own ``profile_key``, ``profile_url`` and ``full_name`` columns.
    """

    __tablename__ = "crawl_run_employee_changes"
    __table_args__ = (
        Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"),
        Index("ix_crawl_run_employee_changes_employee_id", "employee_id"),
        Index("ix_crawl_run_employee_changes_change_type", "change_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    crawl_run_id: Mapped[int] = mapped_column(
        ForeignKey("crawl_runs.id"),
        nullable=False,
    )
    # Nullable: a change row does not have to point at an employee row.
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    change_type: Mapped[str] = mapped_column(String(32), nullable=False)
    # Column type is derived from the ``bool | None`` annotation.
    profile_available: Mapped[bool | None] = mapped_column()
    message: Mapped[str | None] = mapped_column(Text)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )

    crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes")
    employee: Mapped[Employee | None] = relationship(
        back_populates="crawl_run_changes",
    )


class CrawlError(Base):
    """ORM model for the ``crawl_errors`` table.

    References a crawl run by foreign key only; no ORM relationship is
    declared on this model.
    """

    __tablename__ = "crawl_errors"
    __table_args__ = (
        Index("ix_crawl_errors_run_id", "crawl_run_id"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    crawl_run_id: Mapped[int] = mapped_column(
        ForeignKey("crawl_runs.id"),
        nullable=False,
    )
    profile_url: Mapped[str | None] = mapped_column(Text)
    error_type: Mapped[str] = mapped_column(String(255), nullable=False)
    message: Mapped[str] = mapped_column(Text, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )


class ProfileTab(Base):
    """ORM model for the ``profile_tabs`` table; each row belongs to an employee."""

    __tablename__ = "profile_tabs"
    __table_args__ = (
        Index("ix_profile_tabs_employee_id", "employee_id"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(
        ForeignKey("employees.id"),
        nullable=False,
    )
    title: Mapped[str] = mapped_column(Text, nullable=False)
    href: Mapped[str] = mapped_column(Text, nullable=False)
    data_index: Mapped[str | None] = mapped_column(String(64))

    employee: Mapped[Employee] = relationship(back_populates="tabs")


class ParserSource(Base):
    """ORM model for the ``parser_sources`` table; ``source_url`` is unique."""

    __tablename__ = "parser_sources"
    __table_args__ = (
        UniqueConstraint("source_url", name="uq_parser_sources_source_url"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    # Column type is derived from the ``bool`` annotation; defaults to True.
    enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )


class ParseResourceCache(Base):
    """ORM model for the ``parse_resource_cache`` table.

    A row is unique per ``(profile_key, resource_key, request_fingerprint)``.
    ``profile_key`` is a plain string column here, not a foreign key.
    """

    __tablename__ = "parse_resource_cache"
    __table_args__ = (
        UniqueConstraint(
            "profile_key",
            "resource_key",
            "request_fingerprint",
            name="uq_parse_resource_cache_resource",
        ),
        Index("ix_parse_resource_cache_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    resource_key: Mapped[str] = mapped_column(String(255), nullable=False)
    method: Mapped[str] = mapped_column(String(16), nullable=False)
    url: Mapped[str] = mapped_column(Text, nullable=False)
    request_fingerprint: Mapped[str] = mapped_column(String(64), nullable=False)
    etag: Mapped[str | None] = mapped_column(Text)
    # Stored as text, not as a DateTime.
    last_modified: Mapped[str | None] = mapped_column(Text)
    body_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    body_snapshot: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    fetched_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )


class DatasetVersion(Base):
    """ORM model for the ``dataset_versions`` table.

    ``hash`` is unique; ``created_at`` is indexed. Items are owned by the
    version through an "all, delete-orphan" cascade.
    """

    __tablename__ = "dataset_versions"
    __table_args__ = (
        UniqueConstraint("hash", name="uq_dataset_versions_hash"),
        Index("ix_dataset_versions_created_at", "created_at"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    hash: Mapped[str] = mapped_column(String(64), nullable=False)
    # Plain string column; not declared as a foreign key to another version.
    previous_hash: Mapped[str | None] = mapped_column(String(64))
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
    employee_count: Mapped[int] = mapped_column(
        Integer,
        default=0,
        nullable=False,
    )
    active_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(
        Integer,
        default=0,
        nullable=False,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )

    crawl_run: Mapped[CrawlRun | None] = relationship(
        back_populates="dataset_versions",
    )
    items: Mapped[list["DatasetVersionItem"]] = relationship(
        back_populates="dataset_version",
        cascade="all, delete-orphan",
    )


class DatasetVersionItem(Base):
    """ORM model for the ``dataset_version_items`` table.

    A row is unique per ``(dataset_version_id, profile_key)``.
    """

    __tablename__ = "dataset_version_items"
    __table_args__ = (
        UniqueConstraint(
            "dataset_version_id",
            "profile_key",
            name="uq_dataset_version_items_version_profile",
        ),
        # NOTE(review): the index is named "..._hash" but covers
        # ``dataset_version_id``. Renaming it changes the schema, so confirm
        # against existing migrations before touching it.
        Index("ix_dataset_version_items_hash", "dataset_version_id"),
        Index("ix_dataset_version_items_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    dataset_version_id: Mapped[int] = mapped_column(
        ForeignKey("dataset_versions.id"),
        nullable=False,
    )
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    status: Mapped[str] = mapped_column(String(32), nullable=False)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)

    dataset_version: Mapped[DatasetVersion] = relationship(back_populates="items")
    # One-directional: Employee declares no matching back-reference.
    employee: Mapped[Employee | None] = relationship()