236 lines
12 KiB
Python
236 lines
12 KiB
Python
from datetime import datetime, timezone
|
|
|
|
from sqlalchemy import DateTime, ForeignKey, Index, Integer, LargeBinary, String, Text, UniqueConstraint
|
|
from sqlalchemy.dialects.postgresql import JSONB
|
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
|
from sqlalchemy.types import JSON
|
|
|
|
from app.db import Base
|
|
|
|
|
|
def utcnow() -> datetime:
    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)
|
|
|
|
|
|
# Shared column type for JSON payloads: the generic ``JSON`` type, with the
# ``JSONB`` variant substituted when the dialect is "postgresql".
json_type = JSON().with_variant(JSONB, "postgresql")
|
|
|
|
|
|
class Employee(Base):
    """Declarative model mapped to the ``employees`` table.

    ``profile_key`` is unique (``uq_employees_profile_key``); ``full_name``
    and ``status`` are indexed. All timestamp columns are timezone-aware and
    default to ``utcnow``; ``updated_at`` is also refreshed on UPDATE.
    """

    __tablename__ = "employees"
    __table_args__ = (
        UniqueConstraint("profile_key", name="uq_employees_profile_key"),
        Index("ix_employees_full_name", "full_name"),
        Index("ix_employees_status", "status"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_type: Mapped[str | None] = mapped_column(String(50))
    profile_id: Mapped[str | None] = mapped_column(String(255))
    canonical_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    # New rows start as "active"; other status values are set elsewhere.
    status: Mapped[str] = mapped_column(String(32), default="active", nullable=False)
    first_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    last_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    dismissed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    parser_version: Mapped[str | None] = mapped_column(String(32))
    # JSON payload (JSONB on PostgreSQL) and a checksum of up to 64 characters.
    # NOTE(review): presumably the checksum is computed over current_data — confirm.
    current_data: Mapped[dict | None] = mapped_column(json_type)
    current_checksum: Mapped[str | None] = mapped_column(String(64))
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False)

    # NOTE(review): snapshots has no delete cascade, yet
    # EmployeeSnapshot.employee_id is NOT NULL — confirm employees that own
    # snapshots are never deleted through the ORM.
    snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
    # Tabs and publications are owned by the employee: removing them from the
    # collection, or deleting the employee, deletes the child rows.
    tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
    publications: Mapped[list["EmployeePublication"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
    crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee")
|
|
|
|
|
|
class EmployeeSnapshot(Base):
    """Declarative model mapped to the ``employee_snapshots`` table.

    Each row belongs to one employee (required) and may reference a crawl
    run (optional). ``employee_id`` is indexed.
    """

    __tablename__ = "employee_snapshots"
    __table_args__ = (Index("ix_employee_snapshots_employee_id", "employee_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
    # Required JSON payload (JSONB on PostgreSQL).
    parsed_data: Mapped[dict] = mapped_column(json_type, nullable=False)
    # Optional binary blob.
    html_snapshot: Mapped[bytes | None] = mapped_column(LargeBinary)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    captured_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    employee: Mapped[Employee] = relationship(back_populates="snapshots")
|
|
|
|
|
|
class EmployeePublication(Base):
    """Declarative model mapped to the ``employee_publications`` table.

    Per employee, both ``publication_id`` and ``source_hash`` are unique.
    The foreign key to ``employees`` is declared with ``ON DELETE CASCADE``.
    """

    __tablename__ = "employee_publications"
    __table_args__ = (
        UniqueConstraint("employee_id", "publication_id", name="uq_employee_publications_employee_publication"),
        UniqueConstraint("employee_id", "source_hash", name="uq_employee_publications_employee_source_hash"),
        Index("ix_employee_publications_employee_id", "employee_id"),
        Index("ix_employee_publications_publication_id", "publication_id"),
        Index("ix_employee_publications_doi_url", "doi_url"),
        Index("ix_employee_publications_year", "year"),
        Index("ix_employee_publications_publication_type", "publication_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id", ondelete="CASCADE"), nullable=False)
    # Nullable, so the (employee_id, publication_id) uniqueness only binds
    # rows that actually carry a publication_id.
    publication_id: Mapped[str | None] = mapped_column(String(64))
    title: Mapped[str] = mapped_column(Text, nullable=False)
    year: Mapped[int | None] = mapped_column(Integer)
    publication_type: Mapped[str | None] = mapped_column(String(64))
    language: Mapped[str | None] = mapped_column(String(16))
    # Integer here, unlike the string ``status`` columns on other models.
    status: Mapped[int | None] = mapped_column(Integer)
    url: Mapped[str | None] = mapped_column(Text)
    doi_url: Mapped[str | None] = mapped_column(Text)
    other_url: Mapped[str | None] = mapped_column(Text)
    document_url: Mapped[str | None] = mapped_column(Text)
    citation_text: Mapped[str | None] = mapped_column(Text)
    # JSON payloads (JSONB on PostgreSQL).
    annotation: Mapped[dict | None] = mapped_column(json_type)
    description: Mapped[dict | None] = mapped_column(json_type)
    authors: Mapped[list | None] = mapped_column(json_type)
    raw_data: Mapped[dict | None] = mapped_column(json_type)
    source_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False)

    employee: Mapped[Employee] = relationship(back_populates="publications")
|
|
|
|
|
|
class CrawlRun(Base):
    """Declarative model mapped to the ``crawl_runs`` table.

    A new row defaults to status ``"running"``, ``started_at`` set to
    ``utcnow`` and every counter at zero; ``finished_at`` is nullable.
    """

    __tablename__ = "crawl_runs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    status: Mapped[str] = mapped_column(String(32), default="running", nullable=False)
    started_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    # Per-run counters; all required and zero by default.
    found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    skipped_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    message: Mapped[str | None] = mapped_column(Text)

    employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
    dataset_versions: Mapped[list["DatasetVersion"]] = relationship(back_populates="crawl_run")
|
|
|
|
|
|
class CrawlRunEmployeeChange(Base):
    """Declarative model mapped to the ``crawl_run_employee_changes`` table.

    Each row belongs to one crawl run (required) and may reference an
    employee (optional). ``crawl_run_id``, ``employee_id`` and
    ``change_type`` are indexed.
    """

    __tablename__ = "crawl_run_employee_changes"
    __table_args__ = (
        Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"),
        Index("ix_crawl_run_employee_changes_employee_id", "employee_id"),
        Index("ix_crawl_run_employee_changes_change_type", "change_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    # Profile fields are stored on the row itself, so they are available
    # even when employee_id is NULL.
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    change_type: Mapped[str] = mapped_column(String(32), nullable=False)
    # Column type is derived from the ``bool | None`` annotation.
    profile_available: Mapped[bool | None] = mapped_column()
    message: Mapped[str | None] = mapped_column(Text)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes")
    employee: Mapped[Employee | None] = relationship(back_populates="crawl_run_changes")
|
|
|
|
|
|
class CrawlError(Base):
    """Declarative model mapped to the ``crawl_errors`` table.

    Each row references a crawl run through ``crawl_run_id`` (indexed); no
    ORM relationship is declared on this class, only the foreign key column.
    """

    __tablename__ = "crawl_errors"
    __table_args__ = (Index("ix_crawl_errors_run_id", "crawl_run_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
    profile_url: Mapped[str | None] = mapped_column(Text)
    error_type: Mapped[str] = mapped_column(String(255), nullable=False)
    message: Mapped[str] = mapped_column(Text, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
|
|
|
|
|
class ProfileTab(Base):
    """Declarative model mapped to the ``profile_tabs`` table.

    Each row belongs to one employee; ``employee_id`` is indexed.
    """

    __tablename__ = "profile_tabs"
    __table_args__ = (Index("ix_profile_tabs_employee_id", "employee_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # NOTE(review): no ``ondelete`` on this foreign key, unlike
    # EmployeePublication.employee_id; deletion of tabs relies on the ORM
    # cascade declared on Employee.tabs — confirm that is intended.
    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
    title: Mapped[str] = mapped_column(Text, nullable=False)
    href: Mapped[str] = mapped_column(Text, nullable=False)
    data_index: Mapped[str | None] = mapped_column(String(64))

    employee: Mapped[Employee] = relationship(back_populates="tabs")
|
|
|
|
|
|
class ParserSource(Base):
    """Declarative model mapped to the ``parser_sources`` table.

    ``source_url`` is unique (``uq_parser_sources_source_url``); new rows
    are enabled by default.
    """

    __tablename__ = "parser_sources"
    __table_args__ = (UniqueConstraint("source_url", name="uq_parser_sources_source_url"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    # Column type is derived from the ``bool`` annotation.
    enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
|
|
|
|
|
class ParseResourceCache(Base):
    """Declarative model mapped to the ``parse_resource_cache`` table.

    A row is unique per ``(profile_key, resource_key, request_fingerprint)``.
    ``profile_key`` is a plain indexed string column here, not a foreign key.
    """

    __tablename__ = "parse_resource_cache"
    __table_args__ = (
        UniqueConstraint("profile_key", "resource_key", "request_fingerprint", name="uq_parse_resource_cache_resource"),
        Index("ix_parse_resource_cache_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    resource_key: Mapped[str] = mapped_column(String(255), nullable=False)
    method: Mapped[str] = mapped_column(String(16), nullable=False)
    url: Mapped[str] = mapped_column(Text, nullable=False)
    request_fingerprint: Mapped[str] = mapped_column(String(64), nullable=False)
    # Optional text fields.
    # NOTE(review): presumably HTTP validators kept for conditional requests — confirm.
    etag: Mapped[str | None] = mapped_column(Text)
    last_modified: Mapped[str | None] = mapped_column(Text)
    # Both the hash and the binary body are required.
    body_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    body_snapshot: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    fetched_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
|
|
|
|
|
class DatasetVersion(Base):
    """Declarative model mapped to the ``dataset_versions`` table.

    ``hash`` is unique (``uq_dataset_versions_hash``) and ``created_at`` is
    indexed. A version may reference the crawl run it is associated with and
    owns its ``items``.
    """

    __tablename__ = "dataset_versions"
    __table_args__ = (
        UniqueConstraint("hash", name="uq_dataset_versions_hash"),
        Index("ix_dataset_versions_created_at", "created_at"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    hash: Mapped[str] = mapped_column(String(64), nullable=False)
    # Plain string column, not a foreign key to another version row.
    previous_hash: Mapped[str | None] = mapped_column(String(64))
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
    employee_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    active_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    crawl_run: Mapped[CrawlRun | None] = relationship(back_populates="dataset_versions")
    # Items are owned by the version: removing one from the collection, or
    # deleting the version, deletes the item rows.
    items: Mapped[list["DatasetVersionItem"]] = relationship(back_populates="dataset_version", cascade="all, delete-orphan")
|
|
|
|
|
|
class DatasetVersionItem(Base):
    """Declarative model mapped to the ``dataset_version_items`` table.

    A row is unique per ``(dataset_version_id, profile_key)``. The link to
    an employee is optional and one-directional: no ``back_populates`` is
    declared for it.
    """

    __tablename__ = "dataset_version_items"
    __table_args__ = (
        UniqueConstraint("dataset_version_id", "profile_key", name="uq_dataset_version_items_version_profile"),
        # NOTE(review): this index is named "..._hash" but covers
        # dataset_version_id, which is also the leading column of the unique
        # constraint above — confirm the name/column is intended. Left as-is
        # because renaming it would require a schema migration.
        Index("ix_dataset_version_items_hash", "dataset_version_id"),
        Index("ix_dataset_version_items_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    dataset_version_id: Mapped[int] = mapped_column(ForeignKey("dataset_versions.id"), nullable=False)
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    status: Mapped[str] = mapped_column(String(32), nullable=False)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)

    dataset_version: Mapped[DatasetVersion] = relationship(back_populates="items")
    employee: Mapped[Employee | None] = relationship()
|