feat: add detailed employee publications storage and MCP docs

2026-05-15 17:39:41 +03:00
parent 2819a6c334
commit dbaf3af468
14 changed files with 677 additions and 26 deletions
--- a/app/parser/profile.py
+++ b/app/parser/profile.py
@@ -333,7 +333,7 @@ def _load_widget_publications(
        items = _extract_publication_items(result)
        if not items:
            break
-        publications.extend(_normalize_publication_item(item) for item in items)
+        publications.extend(_normalize_publication_item(item, author_id) for item in items)

        total = int(result.get("total") or 0)
        if not result.get("more") and len(publications) >= total:
@@ -575,20 +575,37 @@ def _parse_vkr_items(nodes: list) -> list[str]:
    return [item for item in dict.fromkeys(items) if item]


-def _normalize_publication_item(item: dict) -> dict:
+def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict:
+    """Flatten one raw publication item into a normalized dict.
+
+    ``current_author_id`` is forwarded to ``_normalize_publication_authors``,
+    which uses it to set ``is_current_employee`` on the matching author.
+    The older ``id`` / ``type`` keys are emitted alongside the
+    ``publication_id`` / ``publication_type`` keys with the same values, and
+    the unmodified ``item`` is returned under ``raw_data`` (the same object,
+    not a copy).
+    """
    publication_id = str(item.get("id") or "").strip()
    title = _html_to_text(item.get("title"))
-    year = item.get("year")
+    year = _int_or_none(item.get("year"))
    publication_type = str(item.get("type") or "").strip() or None
    description = item.get("description") if isinstance(item.get("description"), dict) else {}
    short_description = _localized_value(description.get("short")) or _localized_value(description.get("shortLeft"))
+    # Nested sections fall back to {} when missing or not a dict, so the
+    # lookups below never fail on an unexpected payload shape.
+    documents = item.get("documents") if isinstance(item.get("documents"), dict) else {}
+    language = item.get("language") if isinstance(item.get("language"), dict) else {}
+    annotation = _localized_text_map(item.get("annotation"))
+    authors = _normalize_publication_authors(item.get("authorsByType"), current_author_id)
+    # Prefer the "main" description; otherwise build "authors. title. year".
+    # NOTE(review): "main" is stringified with str(); if it can be a localized
+    # dict like "short", its repr would end up in citation_text - confirm.
+    citation_text = normalize_ws(str(description.get("main") or "")) or _build_publication_citation(title, authors, year)
    text = normalize_ws(" ".join(part for part in [title, str(year or ""), short_description] if part))
    return {
        "id": publication_id or None,
+        "publication_id": publication_id or None,
        "title": title or publication_id,
        "year": year,
        "type": publication_type,
+        "publication_type": publication_type,
+        "language": normalize_ws(language.get("name")) or None,
+        "status": _int_or_none(item.get("status")),
        "url": f"https://publications.hse.ru/view/{publication_id}" if publication_id else None,
+        "doi_url": _document_href(documents, "DOI"),
+        "other_url": _document_href(documents, "OTHER_URL"),
+        "document_url": _document_href(documents, "DOCUMENT"),
+        "citation_text": citation_text or None,
+        "annotation": annotation,
+        "description": description or None,
+        "authors": authors,
+        "raw_data": item,
        "text": text or title or publication_id,
    }

@@ -685,12 +702,69 @@ def _html_to_text(value: object) -> str:
    return normalize_ws(BeautifulSoup(str(value or ""), "html.parser").get_text(" ", strip=True))


+def _localized_text_map(value: object) -> dict[str, str]:
+    """Collect the non-empty "ru" / "en" / "publ" entries of *value* as text.
+
+    Each entry is passed through ``_html_to_text``; entries whose text comes
+    out empty are left out.  Anything that is not a dict yields ``{}``.
+    """
+    if not isinstance(value, dict):
+        return {}
+    converted = ((lang, _html_to_text(value.get(lang))) for lang in ("ru", "en", "publ"))
+    return {lang: text for lang, text in converted if text}
+
+
 def _localized_value(value: object) -> str:
+    """Reduce a possibly localized value to one string via ``normalize_ws``.
+
+    For a dict, the first truthy entry among "ru", "publ" and "en" (in that
+    order) is used.  Any other value is stringified, with falsy values
+    becoming "".
+    """
    if isinstance(value, dict):
+        # NOTE(review): when none of the keys is truthy, None is handed to
+        # normalize_ws - presumably it tolerates None; confirm at its definition.
        return normalize_ws(value.get("ru") or value.get("publ") or value.get("en"))
    return normalize_ws(str(value or ""))


+def _normalize_publication_authors(value: object, current_author_id: str | None) -> list[dict]:
+    """Convert the "author" entries of an authors-by-type mapping to flat dicts.
+
+    Only the "author" list of *value* is read.  Entries that are not dicts are
+    skipped, and a non-dict *value* yields an empty list.  A relative ``href``
+    is resolved against ``https://www.hse.ru``.  ``is_current_employee`` is
+    true only when *current_author_id* is truthy and equals the normalized
+    author id.
+    """
+    if not isinstance(value, dict):
+        return []
+
+    def _as_dict(candidate: object) -> dict:
+        # Localized name blocks are only usable when they are dicts.
+        return candidate if isinstance(candidate, dict) else {}
+
+    normalized = []
+    for entry in value.get("author") or []:
+        if not isinstance(entry, dict):
+            continue
+        names = _as_dict(entry.get("title"))
+        reversed_names = _as_dict(entry.get("reverseTitle"))
+        entry_id = normalize_ws(entry.get("id"))
+        link = normalize_ws(entry.get("href"))
+        record = {
+            "id": entry_id or None,
+            "href": urljoin("https://www.hse.ru", link) if link else None,
+            "title_ru": _html_to_text(names.get("ru")),
+            "title_en": _html_to_text(names.get("en")),
+            "reverse_title_ru": _html_to_text(reversed_names.get("ru")),
+            "reverse_title_en": _html_to_text(reversed_names.get("en")),
+            "alt_name": normalize_ws(entry.get("altName")) or None,
+            "other_name": normalize_ws(entry.get("otherName")) or None,
+            "is_current_employee": bool(current_author_id and entry_id == current_author_id),
+        }
+        normalized.append(record)
+    return normalized
+
+
+def _document_href(documents: dict, key: str) -> str | None:
+    """Return the "href" of ``documents[key]`` after ``normalize_ws``.
+
+    ``None`` is returned when the entry is missing, is not a dict, or its
+    normalized href is empty.
+    """
+    entry = documents.get(key)
+    href = normalize_ws(entry.get("href")) if isinstance(entry, dict) else None
+    return href or None
+
+
+def _build_publication_citation(title: str, authors: list[dict], year: int | None) -> str:
+    """Compose a fallback citation shaped like "authors. title. year".
+
+    Each author contributes the first truthy value among ``title_ru``,
+    ``title_en`` and ``alt_name``; authors with none of them are dropped.
+    Empty segments are omitted before joining.
+    """
+    names = []
+    for author in authors:
+        name = author.get("title_ru") or author.get("title_en") or author.get("alt_name")
+        if name:
+            names.append(name)
+    segments = [", ".join(names), title, str(year or "")]
+    return normalize_ws(". ".join(segment for segment in segments if segment))
+
+
+def _int_or_none(value: object) -> int | None:
+    """Coerce *value* to ``int``, returning ``None`` when that is impossible.
+
+    Covers values ``int()`` rejects by type (e.g. ``None``), by content
+    (e.g. ``"abc"`` or NaN) and by magnitude (infinite floats).
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: int(float("inf")); json.loads accepts "Infinity" by
+        # default, so such a value can arrive from a decoded payload.
+        return None
+
+
 def _slugify(value: str) -> str:
+    """Turn *value* into a lowercase, underscore-separated slug.
+
+    Characters other than word characters, whitespace and hyphens are removed
+    (Unicode-aware).  Runs of hyphens/whitespace collapse to a single "_", and
+    leading/trailing underscores are stripped.  An empty result falls back to
+    "section".
+    """
    cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
    return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"