feat: add detailed employee publications storage and MCP docs

This commit is contained in:
Anton
2026-05-15 17:39:41 +03:00
parent 2819a6c334
commit dbaf3af468
14 changed files with 677 additions and 26 deletions

View File

@@ -1,7 +1,7 @@
import gzip
from datetime import datetime, timezone
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParseResourceCache
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
from app.services.resource_cache import ResourceCache
@@ -191,6 +191,68 @@ def test_upsert_employee_skips_snapshot_when_checksum_is_unchanged(db_session):
assert db_session.query(EmployeeSnapshot).count() == 1
def test_upsert_employee_saves_publications_and_reuses_existing_rows(db_session):
    """Upserting the same publication in two crawl runs keeps one stored row.

    The same parsed employee (profile id ``"published"``) carrying a single
    publication is upserted once per crawl run. Afterwards exactly one
    ``EmployeePublication`` row must exist for the employee, and its
    ``doi_url`` and ``authors`` must match the fixture values.
    """
    first_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    second_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add_all([first_run, second_run])
    db_session.commit()

    # Both runs use the shared fixture helper so the two payloads cannot drift
    # apart; the helper is called twice so each upsert gets its own dict.
    employee, _ = _upsert_employee(db_session, first_run, _parsed_employee_with_publication("published"))
    db_session.commit()
    _upsert_employee(db_session, second_run, _parsed_employee_with_publication("published"))
    db_session.commit()

    publications = db_session.query(EmployeePublication).filter_by(employee_id=employee.id).all()
    assert len(publications) == 1
    assert publications[0].doi_url == "https://doi.org/10.1/test"
    assert publications[0].authors == [{"id": "1", "title_ru": "Автор"}]
def test_upsert_employee_records_publication_errors_without_failing_employee(monkeypatch, db_session):
    """A failing publication sync must not prevent the employee upsert.

    ``app.services.crawler._sync_employee_publications`` is patched to raise
    ``RuntimeError``. The test then checks that the employee is still stored
    and reported as changed, and that a ``CrawlError`` whose lowercased
    message contains "публикации" is recorded.
    """
    crawl_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl_run)
    db_session.commit()

    def _always_fail(*_args, **_kwargs):
        raise RuntimeError("boom")

    monkeypatch.setattr("app.services.crawler._sync_employee_publications", _always_fail)

    payload = _parsed_employee_with_publication("error-safe")
    employee, changed = _upsert_employee(db_session, crawl_run, payload)
    db_session.commit()

    # The employee row is written despite the publication failure.
    assert changed is True
    assert employee.full_name == "Same Person"
    assert db_session.query(Employee).filter_by(profile_key="staff:error-safe").one()

    # NOTE(review): assuming SQLAlchemy semantics, .one() also fails unless
    # exactly one CrawlError row exists - confirm against the session fixture.
    recorded_error = db_session.query(CrawlError).one()
    assert "публикации" in recorded_error.message.lower()
def test_checksum_changes_when_widget_data_changes():
base = _parsed_employee("widgets")
changed = _parsed_employee("widgets")
@@ -224,3 +286,31 @@ def _parsed_employee(profile_id: str) -> dict:
"parser_version": "0.6.0",
"_html": "<html></html>",
}
def _parsed_employee_with_publication(profile_id: str) -> dict:
    """Return the ``_parsed_employee`` payload with one publications section.

    The base payload for *profile_id* has its ``"sections"`` entry replaced by
    a single section of type ``"publications"`` holding one fixed publication
    record (id ``888959076``).
    """
    publication = {
        "id": "888959076",
        "publication_id": "888959076",
        "title": "Detailed Publication",
        "year": 2023,
        "publication_type": "ARTICLE",
        "language": "ru",
        "status": 1,
        "url": "https://publications.hse.ru/view/888959076",
        "doi_url": "https://doi.org/10.1/test",
        "citation_text": "Detailed citation",
        "annotation": {"ru": "Аннотация"},
        "description": {"main": "Detailed citation"},
        "authors": [{"id": "1", "title_ru": "Автор"}],
        "raw_data": {"id": "888959076", "title": "Detailed Publication"},
    }

    payload = _parsed_employee(profile_id)
    payload["sections"] = [{"type": "publications", "publications": [publication]}]
    return payload