feat: add detailed employee publications storage and MCP docs

2026-05-15 17:39:41 +03:00
parent 2819a6c334
commit dbaf3af468
14 changed files with 677 additions and 26 deletions
--- a/app/db.py
+++ b/app/db.py
@@ -29,8 +29,15 @@ def init_db() -> None:


 def _ensure_runtime_schema() -> None:
+    import app.models as models
+
    inspector = inspect(engine)
-    if "crawl_runs" not in inspector.get_table_names():
+    table_names = set(inspector.get_table_names())
+    if "employees" in table_names and "employee_publications" not in table_names:
+        models.EmployeePublication.__table__.create(bind=engine, checkfirst=True)
+        inspector = inspect(engine)
+        table_names = set(inspector.get_table_names())
+    if "crawl_runs" not in table_names:
        return
    crawl_run_columns = {column["name"] for column in inspector.get_columns("crawl_runs")}
    if "skipped_count" not in crawl_run_columns:
--- a/app/mcp.py
+++ b/app/mcp.py
@@ -5,7 +5,7 @@ from sqlalchemy import desc, or_, select
 from sqlalchemy.orm import Session

 from app.db import get_db
-from app.models import CrawlRun, Employee
+from app.models import CrawlRun, Employee, EmployeePublication
 from app.services.admin_data import run_detail_payload
 from app.services.dataset_versions import service_info_payload, sync_employees_payload
 from app.version import BACKEND_VERSION
@@ -52,7 +52,10 @@ TOOLS = [
    },
    {
        "name": "list_employee_publications",
-        "description": "List publications parsed from an employee profile.",
+        "description": (
+            "List employee publications with detailed fields when available: authors, DOI URL, annotation, "
+            "description, citation text, year, publication type, language, status, and HSE Publications URL."
+        ),
        "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
    },
    {
@@ -171,8 +174,14 @@ def _find_employee(db: Session, value: str) -> Employee | None:


 def _collect_section_items(employee: Employee | None, section_type: str) -> dict:
-    if not employee or not employee.current_data:
+    if not employee:
        return {"items": []}
+    if section_type == "publications":
+        publications = _stored_publications(employee)
+        if publications:
+            return {"employee": _employee_payload(employee, include_data=False), "items": publications}
+    if not employee.current_data:
+        return {"employee": _employee_payload(employee, include_data=False), "items": []}
    items = []
    for section in employee.current_data.get("sections") or []:
        if section.get("type") != section_type:
@@ -184,6 +193,41 @@ def _collect_section_items(employee: Employee | None, section_type: str) -> dict
    return {"employee": _employee_payload(employee, include_data=False), "items": items}


+def _stored_publications(employee: Employee) -> list[dict]:
+    """Serialize the employee's stored publications, ordered by `_publication_sort_key`."""
+    ordered_rows = sorted(employee.publications, key=_publication_sort_key)
+    return [_publication_payload(row) for row in ordered_rows]
+
+
+def _publication_sort_key(publication: EmployeePublication) -> tuple:
+    """Return the ordering key ``(year, title, id)``; a missing year sorts as 0, a missing title as ""."""
+    year = publication.year or 0
+    title = publication.title or ""
+    return (year, title, publication.id)
+
+
+def _publication_payload(publication: EmployeePublication) -> dict:
+    """Build the response dict for one stored publication.
+
+    The keys ``id``, ``publication_id``, ``title``, ``text`` and ``url`` are always
+    present (``text`` falls back to the title when there is no citation text).
+    Every other field is included only when its value is not ``None``, ``[]`` or ``{}``.
+    """
+    result = {
+        "id": publication.publication_id,
+        "publication_id": publication.publication_id,
+        "title": publication.title,
+        "text": publication.citation_text or publication.title,
+        "url": publication.url,
+    }
+    detail_fields = (
+        ("year", publication.year),
+        ("type", publication.publication_type),
+        ("publication_type", publication.publication_type),
+        ("language", publication.language),
+        ("status", publication.status),
+        ("doi_url", publication.doi_url),
+        ("other_url", publication.other_url),
+        ("document_url", publication.document_url),
+        ("citation_text", publication.citation_text),
+        ("annotation", publication.annotation),
+        ("description", publication.description),
+        ("authors", publication.authors),
+    )
+    for name, value in detail_fields:
+        # Skip absent values and empty containers; 0 and "" are still emitted.
+        if value not in (None, [], {}):
+            result[name] = value
+    return result
+
+
 def _employee_payload(employee: Employee, include_data: bool = True) -> dict:
    payload = {
        "profile_key": employee.profile_key,
--- a/app/models.py
+++ b/app/models.py
@@ -41,6 +41,7 @@ class Employee(Base):

    snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
    tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
+    publications: Mapped[list["EmployeePublication"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
    crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee")


@@ -60,6 +61,42 @@ class EmployeeSnapshot(Base):
    employee: Mapped[Employee] = relationship(back_populates="snapshots")


+class EmployeePublication(Base):
+    """Detailed publication record belonging to a single employee (table ``employee_publications``)."""
+    __tablename__ = "employee_publications"
+    __table_args__ = (
+        # Per employee, a publication id and a source hash may each appear only once.
+        UniqueConstraint("employee_id", "publication_id", name="uq_employee_publications_employee_publication"),
+        UniqueConstraint("employee_id", "source_hash", name="uq_employee_publications_employee_source_hash"),
+        # Secondary indexes on the owner and on individual publication columns.
+        Index("ix_employee_publications_employee_id", "employee_id"),
+        Index("ix_employee_publications_publication_id", "publication_id"),
+        Index("ix_employee_publications_doi_url", "doi_url"),
+        Index("ix_employee_publications_year", "year"),
+        Index("ix_employee_publications_publication_type", "publication_type"),
+    )
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    # Owning employee; the foreign key is declared with ON DELETE CASCADE.
+    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id", ondelete="CASCADE"), nullable=False)
+    # Identifier of the publication in the source system; nullable.
+    publication_id: Mapped[str | None] = mapped_column(String(64))
+    title: Mapped[str] = mapped_column(Text, nullable=False)
+    year: Mapped[int | None] = mapped_column(Integer)
+    publication_type: Mapped[str | None] = mapped_column(String(64))
+    language: Mapped[str | None] = mapped_column(String(16))
+    status: Mapped[int | None] = mapped_column(Integer)
+    # Link columns; all optional.
+    url: Mapped[str | None] = mapped_column(Text)
+    doi_url: Mapped[str | None] = mapped_column(Text)
+    other_url: Mapped[str | None] = mapped_column(Text)
+    document_url: Mapped[str | None] = mapped_column(Text)
+    citation_text: Mapped[str | None] = mapped_column(Text)
+    # JSON columns (stored with the module's ``json_type``).
+    annotation: Mapped[dict | None] = mapped_column(json_type)
+    description: Mapped[dict | None] = mapped_column(json_type)
+    authors: Mapped[list | None] = mapped_column(json_type)
+    raw_data: Mapped[dict | None] = mapped_column(json_type)
+    # Required 64-character hash of the source payload; part of a per-employee unique constraint.
+    source_hash: Mapped[str] = mapped_column(String(64), nullable=False)
+    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
+    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False)
+
+    # Back-reference to the owner; mirrors ``Employee.publications``.
+    employee: Mapped[Employee] = relationship(back_populates="publications")
+
+
 class CrawlRun(Base):
    __tablename__ = "crawl_runs"

--- a/app/parser/profile.py
+++ b/app/parser/profile.py
@@ -333,7 +333,7 @@ def _load_widget_publications(
        items = _extract_publication_items(result)
        if not items:
            break
-        publications.extend(_normalize_publication_item(item) for item in items)
+        publications.extend(_normalize_publication_item(item, author_id) for item in items)

        total = int(result.get("total") or 0)
        if not result.get("more") and len(publications) >= total:
@@ -575,20 +575,37 @@ def _parse_vkr_items(nodes: list) -> list[str]:
    return [item for item in dict.fromkeys(items) if item]


-def _normalize_publication_item(item: dict) -> dict:
+def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict:
+    """Convert one raw publication item into the flat publication dict.
+
+    ``current_author_id`` is forwarded to `_normalize_publication_authors`. The
+    citation text is ``description["main"]`` when present, otherwise it is built
+    from the authors, title and year. The untouched ``item`` is kept under ``raw_data``.
+    """
    publication_id = str(item.get("id") or "").strip()
    title = _html_to_text(item.get("title"))
-    year = item.get("year")
+    year = _int_or_none(item.get("year"))
    publication_type = str(item.get("type") or "").strip() or None
    description = item.get("description") if isinstance(item.get("description"), dict) else {}
    short_description = _localized_value(description.get("short")) or _localized_value(description.get("shortLeft"))
+    documents = item.get("documents") if isinstance(item.get("documents"), dict) else {}
+    language = item.get("language") if isinstance(item.get("language"), dict) else {}
+    annotation = _localized_text_map(item.get("annotation"))
+    authors = _normalize_publication_authors(item.get("authorsByType"), current_author_id)
+    citation_text = normalize_ws(str(description.get("main") or "")) or _build_publication_citation(title, authors, year)
    text = normalize_ws(" ".join(part for part in [title, str(year or ""), short_description] if part))
    return {
        "id": publication_id or None,
+        "publication_id": publication_id or None,
        "title": title or publication_id,
        "year": year,
        "type": publication_type,
+        "publication_type": publication_type,
+        "language": normalize_ws(language.get("name")) or None,
+        "status": _int_or_none(item.get("status")),
        "url": f"https://publications.hse.ru/view/{publication_id}" if publication_id else None,
+        "doi_url": _document_href(documents, "DOI"),
+        "other_url": _document_href(documents, "OTHER_URL"),
+        "document_url": _document_href(documents, "DOCUMENT"),
+        "citation_text": citation_text or None,
+        "annotation": annotation,
+        "description": description or None,
+        "authors": authors,
+        "raw_data": item,
        "text": text or title or publication_id,
    }

@@ -685,12 +702,69 @@ def _html_to_text(value: object) -> str:
    return normalize_ws(BeautifulSoup(str(value or ""), "html.parser").get_text(" ", strip=True))


+def _localized_text_map(value: object) -> dict[str, str]:
+    """Return the non-empty ``ru``/``en``/``publ`` entries of ``value`` as plain text.
+
+    Anything that is not a dict yields an empty mapping.
+    """
+    if not isinstance(value, dict):
+        return {}
+    cleaned = ((lang, _html_to_text(value.get(lang))) for lang in ("ru", "en", "publ"))
+    return {lang: text for lang, text in cleaned if text}
+
+
 def _localized_value(value: object) -> str:
    if isinstance(value, dict):
        return normalize_ws(value.get("ru") or value.get("publ") or value.get("en"))
    return normalize_ws(str(value or ""))


+def _normalize_publication_authors(value: object, current_author_id: str | None) -> list[dict]:
+    """Flatten the ``author`` entries of an ``authorsByType`` mapping into plain dicts.
+
+    Returns an empty list when ``value`` is not a dict; author entries that are not
+    dicts are skipped. Ids are coerced to ``str`` before normalisation (as is done
+    for the publication id), so a numeric id from the JSON payload still compares
+    equal to ``current_author_id``. ``is_current_employee`` is True only when
+    ``current_author_id`` is non-empty and matches the author's id.
+    """
+    if not isinstance(value, dict):
+        return []
+    current_id = normalize_ws(str(current_author_id or ""))
+    authors = []
+    for author in value.get("author") or []:
+        if not isinstance(author, dict):
+            continue
+        title = author.get("title") if isinstance(author.get("title"), dict) else {}
+        reverse_title = author.get("reverseTitle") if isinstance(author.get("reverseTitle"), dict) else {}
+        author_id = normalize_ws(str(author.get("id") or ""))
+        href = normalize_ws(author.get("href"))
+        authors.append(
+            {
+                "id": author_id or None,
+                "href": urljoin("https://www.hse.ru", href) if href else None,
+                "title_ru": _html_to_text(title.get("ru")),
+                "title_en": _html_to_text(title.get("en")),
+                "reverse_title_ru": _html_to_text(reverse_title.get("ru")),
+                "reverse_title_en": _html_to_text(reverse_title.get("en")),
+                "alt_name": normalize_ws(author.get("altName")) or None,
+                "other_name": normalize_ws(author.get("otherName")) or None,
+                "is_current_employee": bool(current_id and author_id == current_id),
+            }
+        )
+    return authors
+
+
+def _document_href(documents: dict, key: str) -> str | None:
+    """Return the whitespace-normalised ``href`` of ``documents[key]``, or ``None`` when absent or empty."""
+    entry = documents.get(key)
+    if isinstance(entry, dict):
+        return normalize_ws(entry.get("href")) or None
+    return None
+
+
+def _build_publication_citation(title: str, authors: list[dict], year: int | None) -> str:
+    """Compose a fallback citation of the form ``"<authors>. <title>. <year>"``.
+
+    Each author contributes the first non-empty value of ``title_ru``, ``title_en``
+    and ``alt_name``; empty parts are left out of the result.
+    """
+    names = []
+    for author in authors:
+        name = author.get("title_ru") or author.get("title_en") or author.get("alt_name")
+        if name:
+            names.append(name)
+    segments = [", ".join(names), title, str(year or "")]
+    return normalize_ws(". ".join(segment for segment in segments if segment))
+
+
+def _int_or_none(value: object) -> int | None:
+    """Return ``int(value)``, or ``None`` when the value cannot be converted.
+
+    ``OverflowError`` is handled as well: ``int(float("inf"))`` raises it, and the
+    ``json`` module accepts ``Infinity`` in payloads.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
 def _slugify(value: str) -> str:
    cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
    return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -6,11 +6,20 @@ import time
 from datetime import datetime, timezone

 import requests
-from sqlalchemy import select
+from sqlalchemy import inspect, select
 from sqlalchemy.orm import Session

 from app.config import Settings
-from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
+from app.models import (
+    CrawlError,
+    CrawlRun,
+    CrawlRunEmployeeChange,
+    Employee,
+    EmployeePublication,
+    EmployeeSnapshot,
+    ParserSource,
+    ProfileTab,
+)
 from app.parser.collector import collect_profile_links
 from app.parser.profile import parse_person_profile
 from app.parser.profile_url import profile_key
@@ -219,9 +228,127 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> tuple[Employee
                parser_version=parser_version,
            )
        )
+    db.flush()
+    _try_sync_employee_publications(db, run, employee, parsed)
    return employee, changed


+def _try_sync_employee_publications(db: Session, run: CrawlRun, employee: Employee, parsed: dict) -> None:
+    """Best-effort wrapper around `_sync_employee_publications`.
+
+    Does nothing when the parsed profile has no publications or the
+    ``employee_publications`` table does not exist. The sync itself runs inside a
+    nested transaction; any exception is recorded as a ``CrawlError`` on the run
+    instead of propagating to the caller.
+    """
+    try:
+        # Short-circuit keeps the table check from running when there is nothing to store.
+        nothing_to_sync = not _publication_payloads(parsed) or not _employee_publications_table_exists(db)
+        if nothing_to_sync:
+            return
+        with db.begin_nested():
+            _sync_employee_publications(db, employee, parsed)
+    except Exception as exc:
+        failure = CrawlError(
+            crawl_run_id=run.id,
+            profile_url=employee.canonical_url,
+            error_type=type(exc).__name__,
+            message=f"Не удалось сохранить публикации сотрудника: {exc}",
+        )
+        db.add(failure)
+
+
+def _employee_publications_table_exists(db: Session) -> bool:
+    """Report whether the ``EmployeePublication`` table exists on the session's connection."""
+    inspector = inspect(db.connection())
+    return inspector.has_table(EmployeePublication.__tablename__)
+
+
+def _sync_employee_publications(db: Session, employee: Employee, parsed: dict) -> None:
+    """Mirror the parsed publications of ``employee`` into ``employee_publications``.
+
+    Each parsed publication is matched to a stored row first by ``publication_id``
+    and then by ``source_hash``. Unmatched publications are inserted, and stored
+    rows that matched nothing are deleted. When ``parsed`` contains no publications
+    the stored rows are left untouched.
+
+    Stored rows are loaded once and matched in memory. The outcome therefore does
+    not depend on the session's autoflush setting (pending inserts and hash updates
+    are visible to later iterations and to the stale check), and no SELECT is
+    issued per publication.
+    """
+    publications = _publication_payloads(parsed)
+    if not publications:
+        return
+
+    stored = db.scalars(
+        select(EmployeePublication).where(EmployeePublication.employee_id == employee.id)
+    ).all()
+    by_publication_id = {row.publication_id: row for row in stored if row.publication_id}
+    by_source_hash = {row.source_hash: row for row in stored}
+    kept: set[int] = set()
+
+    for publication in publications:
+        source_hash = _publication_hash(publication)
+        publication_id = _clean_optional(publication.get("publication_id") or publication.get("id"))
+        target = by_publication_id.get(publication_id) if publication_id else None
+        if target is None:
+            target = by_source_hash.get(source_hash)
+        if target is None:
+            target = EmployeePublication(employee_id=employee.id, source_hash=source_hash, title=_publication_title(publication))
+            db.add(target)
+        else:
+            # The row's id/hash are about to be overwritten; drop its old index entries
+            # so later publications cannot match it through outdated values.
+            if by_publication_id.get(target.publication_id) is target:
+                del by_publication_id[target.publication_id]
+            if by_source_hash.get(target.source_hash) is target:
+                del by_source_hash[target.source_hash]
+        _apply_publication(target, publication, source_hash)
+        if target.publication_id:
+            by_publication_id[target.publication_id] = target
+        by_source_hash[source_hash] = target
+        kept.add(id(target))
+
+    # Anything stored that no parsed publication claimed is stale.
+    for row in stored:
+        if id(row) not in kept:
+            db.delete(row)
+    # Flush here so constraint violations are raised from this function.
+    db.flush()
+
+
+def _publication_payloads(parsed: dict) -> list[dict]:
+    """Collect every dict entry under ``publications`` of the ``publications``-type sections.
+
+    Sections and entries that are not dicts are ignored; a missing or empty
+    ``sections`` value yields an empty list.
+    """
+    sections = parsed.get("sections") or []
+    return [
+        entry
+        for section in sections
+        if isinstance(section, dict) and section.get("type") == "publications"
+        for entry in section.get("publications") or []
+        if isinstance(entry, dict)
+    ]
+
+
+def _apply_publication(target: EmployeePublication, publication: dict, source_hash: str) -> None:
+    """Copy the fields of a parsed publication dict onto ``target``.
+
+    Text fields go through ``_clean_optional``, ``year`` and ``status`` through
+    ``_int_or_none``. ``annotation``/``description`` are kept only when they are
+    dicts and ``authors`` only when it is a list. ``raw_data`` falls back to the
+    whole publication dict when the payload carries no dict under that key.
+    """
+    field = publication.get
+
+    def container(key: str, expected: type) -> object:
+        candidate = field(key)
+        return candidate if isinstance(candidate, expected) else None
+
+    raw_data = field("raw_data")
+    target.publication_id = _clean_optional(field("publication_id") or field("id"))
+    target.title = _publication_title(publication)
+    target.year = _int_or_none(field("year"))
+    target.publication_type = _clean_optional(field("publication_type") or field("type"))
+    target.language = _clean_optional(field("language"))
+    target.status = _int_or_none(field("status"))
+    for attribute in ("url", "doi_url", "other_url", "document_url"):
+        setattr(target, attribute, _clean_optional(field(attribute)))
+    target.citation_text = _clean_optional(field("citation_text") or field("text"))
+    target.annotation = container("annotation", dict)
+    target.description = container("description", dict)
+    target.authors = container("authors", list)
+    target.raw_data = raw_data if isinstance(raw_data, dict) else publication
+    target.source_hash = source_hash
+
+
+def _publication_hash(publication: dict) -> str:
+    """Hash the publication's ``raw_data`` dict, or the publication itself when there is none."""
+    raw_data = publication.get("raw_data")
+    source = raw_data if isinstance(raw_data, dict) else publication
+    return _payload_hash(source)
+
+
+def _payload_hash(value: object) -> str:
+    """Return the SHA-256 hex digest of the compact, key-sorted JSON form of ``value``.
+
+    The value is first passed through ``_stable_checksum_payload``; objects that
+    JSON cannot encode are stringified with ``str``.
+    """
+    stable = _stable_checksum_payload(value)
+    encoded = json.dumps(stable, ensure_ascii=False, sort_keys=True, separators=(",", ":"), default=str).encode("utf-8")
+    return hashlib.sha256(encoded).hexdigest()
+
+
+def _publication_title(publication: dict) -> str:
+    """Pick a title from ``title``, ``text`` or ``id`` (in that order), with a fixed placeholder as last resort."""
+    candidate = publication.get("title") or publication.get("text") or publication.get("id")
+    cleaned = _clean_optional(candidate)
+    return cleaned if cleaned else "Untitled publication"
+
+
+def _clean_optional(value: object) -> str | None:
+    """Return ``value`` as a stripped string, or ``None`` when it is falsy or blank."""
+    if not value:
+        return None
+    stripped = str(value).strip()
+    return stripped if stripped else None
+
+
+def _int_or_none(value: object) -> int | None:
+    """Return ``int(value)``, or ``None`` when the value cannot be converted.
+
+    ``OverflowError`` is handled as well: ``int(float("inf"))`` raises it, and the
+    ``json`` module accepts ``Infinity`` in payloads.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
 def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
    dismissed = 0
    active = db.scalars(select(Employee).where(Employee.status == "active")).all()
--- a/app/version.py
+++ b/app/version.py
@@ -1,3 +1,3 @@
-APP_VERSION = "0.6.1"
-FRONTEND_VERSION = "0.6.1"
-BACKEND_VERSION = "0.6.1"
+APP_VERSION = "0.6.2"
+FRONTEND_VERSION = "0.6.2"
+BACKEND_VERSION = "0.6.2"