feat: add detailed employee publications storage and MCP docs

This commit is contained in:
Anton
2026-05-15 17:39:41 +03:00
parent 2819a6c334
commit dbaf3af468
14 changed files with 677 additions and 26 deletions

View File

@@ -10,7 +10,7 @@ from sqlalchemy.pool import StaticPool
from app.config import Settings, get_settings
from app.db import Base, get_db
from app.main import app
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication
from app.security import SESSION_COOKIE, sign_session
@@ -20,7 +20,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.6.1"
assert response.json()["backend_version"] == "0.6.2"
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
@@ -154,13 +154,115 @@ def test_mcp_service_info_returns_tools_and_dataset_hash():
assert response.status_code == 200
payload = json.loads(response.json()["result"]["content"][0]["text"])
assert payload["service_name"] == "miem-employees"
assert payload["backend_version"] == "0.6.1"
assert payload["backend_version"] == "0.6.2"
assert payload["dataset"]["hash"]
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
app.dependency_overrides.clear()
def test_mcp_list_employee_publications_prefers_stored_publications_with_fallback():
    """Check which publication source ``list_employee_publications`` answers from.

    Two employees are seeded, each with a publications section inside
    ``current_data``. Only the "stored" employee also gets an
    ``EmployeePublication`` row. The tool must return the stored row for that
    employee and the ``current_data`` publication for the other one.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)

    seed_session = Session()
    try:
        stored_employee = Employee(
            profile_key="staff:stored",
            profile_type="staff",
            profile_id="stored",
            canonical_url="https://www.hse.ru/staff/stored",
            full_name="Stored Person",
            status="active",
            current_data={
                "sections": [
                    {
                        "type": "publications",
                        "publications": [{"title": "Old JSON Publication", "url": "https://example.test/old"}],
                    }
                ]
            },
        )
        fallback_employee = Employee(
            profile_key="staff:fallback",
            profile_type="staff",
            profile_id="fallback",
            canonical_url="https://www.hse.ru/staff/fallback",
            full_name="Fallback Person",
            status="active",
            current_data={
                "sections": [
                    {
                        "type": "publications",
                        "publications": [{"title": "Fallback Publication", "url": "https://example.test/fallback"}],
                    }
                ]
            },
        )
        seed_session.add_all([stored_employee, fallback_employee])
        # Commit before creating the publication: its employee_id is read
        # from the freshly persisted employee.
        seed_session.commit()
        seed_session.add(
            EmployeePublication(
                employee_id=stored_employee.id,
                publication_id="pub-1",
                title="Stored Publication",
                year=2024,
                publication_type="ARTICLE",
                url="https://publications.hse.ru/view/pub-1",
                doi_url="https://doi.org/10.1/test",
                citation_text="Stored Citation",
                annotation={"ru": "Аннотация", "en": "Abstract"},
                description={"main": "Stored Citation"},
                authors=[{"id": "1", "title_ru": "Автор", "is_current_employee": True}],
                source_hash="a" * 64,
            )
        )
        seed_session.commit()
    finally:
        # Release the seeding session even when seeding itself fails.
        seed_session.close()

    def override_db():
        db = Session()
        try:
            yield db
        finally:
            db.close()

    def call_list_publications(client, request_id, profile):
        """POST a JSON-RPC ``tools/call`` for ``list_employee_publications``."""
        return client.post(
            "/mcp",
            json={
                "jsonrpc": "2.0",
                "id": request_id,
                "method": "tools/call",
                "params": {"name": "list_employee_publications", "arguments": {"profile_id_or_url": profile}},
            },
        )

    app.dependency_overrides[get_db] = override_db
    try:
        client = TestClient(app)
        stored_response = call_list_publications(client, 1, "stored")
        fallback_response = call_list_publications(client, 2, "fallback")

        # Check the HTTP status first so a transport failure is reported as
        # such and not as a KeyError while unpacking the payload.
        assert stored_response.status_code == 200
        assert fallback_response.status_code == 200

        stored_payload = json.loads(stored_response.json()["result"]["content"][0]["text"])
        fallback_payload = json.loads(fallback_response.json()["result"]["content"][0]["text"])

        stored_item = stored_payload["items"][0]
        assert stored_item["title"] == "Stored Publication"
        assert stored_item["doi_url"] == "https://doi.org/10.1/test"
        assert stored_item["annotation"] == {"ru": "Аннотация", "en": "Abstract"}
        assert stored_item["authors"] == [{"id": "1", "title_ru": "Автор", "is_current_employee": True}]
        assert fallback_payload["items"][0]["title"] == "Fallback Publication"
    finally:
        # Always drop the override so a failing assertion cannot leak this
        # in-memory database into other tests sharing the global ``app``.
        app.dependency_overrides.clear()
def test_mcp_sync_employees_full_empty_and_unknown_hash_modes():
engine = create_engine(
"sqlite:///:memory:",

View File

@@ -1,7 +1,7 @@
import gzip
from datetime import datetime, timezone
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParseResourceCache
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
from app.services.resource_cache import ResourceCache
@@ -191,6 +191,68 @@ def test_upsert_employee_skips_snapshot_when_checksum_is_unchanged(db_session):
assert db_session.query(EmployeeSnapshot).count() == 1
def test_upsert_employee_saves_publications_and_reuses_existing_rows(db_session):
    """Upserting the same publication in two crawl runs must keep one row.

    Both runs receive an equal payload built by
    ``_parsed_employee_with_publication``; after the second upsert exactly one
    ``EmployeePublication`` row must exist for the employee, and it must still
    carry the DOI link and author list from the payload.
    """
    first_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    second_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add_all([first_run, second_run])
    db_session.commit()

    # Each call builds a fresh dict, so the second upsert cannot be affected
    # by any mutation done to the first payload.
    employee, _ = _upsert_employee(db_session, first_run, _parsed_employee_with_publication("published"))
    db_session.commit()
    _upsert_employee(db_session, second_run, _parsed_employee_with_publication("published"))
    db_session.commit()

    publications = db_session.query(EmployeePublication).filter_by(employee_id=employee.id).all()
    assert len(publications) == 1
    assert publications[0].doi_url == "https://doi.org/10.1/test"
    assert publications[0].authors == [{"id": "1", "title_ru": "Автор"}]
def test_upsert_employee_records_publication_errors_without_failing_employee(monkeypatch, db_session):
    """A failing publication sync is stored as a crawl error; the employee is still upserted."""
    crawl_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl_run)
    db_session.commit()

    def exploding_sync(*_args, **_kwargs):
        raise RuntimeError("boom")

    # Replace the publication sync so that it raises on every call.
    monkeypatch.setattr("app.services.crawler._sync_employee_publications", exploding_sync)

    payload = _parsed_employee_with_publication("error-safe")
    employee, changed = _upsert_employee(db_session, crawl_run, payload)
    db_session.commit()

    assert changed is True
    assert employee.full_name == "Same Person"

    persisted = db_session.query(Employee).filter_by(profile_key="staff:error-safe").one()
    assert persisted

    crawl_error = db_session.query(CrawlError).one()
    assert "публикации" in crawl_error.message.lower()
def test_checksum_changes_when_widget_data_changes():
base = _parsed_employee("widgets")
changed = _parsed_employee("widgets")
@@ -224,3 +286,31 @@ def _parsed_employee(profile_id: str) -> dict:
"parser_version": "0.6.0",
"_html": "<html></html>",
}
def _parsed_employee_with_publication(profile_id: str) -> dict:
    """Return the ``_parsed_employee`` payload with a single-publication section.

    The payload's ``sections`` value is replaced by one ``publications``
    section holding one detailed publication record.
    """
    publication = {
        "id": "888959076",
        "publication_id": "888959076",
        "title": "Detailed Publication",
        "year": 2023,
        "publication_type": "ARTICLE",
        "language": "ru",
        "status": 1,
        "url": "https://publications.hse.ru/view/888959076",
        "doi_url": "https://doi.org/10.1/test",
        "citation_text": "Detailed citation",
        "annotation": {"ru": "Аннотация"},
        "description": {"main": "Detailed citation"},
        "authors": [{"id": "1", "title_ru": "Автор"}],
        "raw_data": {"id": "888959076", "title": "Detailed Publication"},
    }
    publications_section = {"type": "publications", "publications": [publication]}

    payload = _parsed_employee(profile_id)
    payload["sections"] = [publications_section]
    return payload

View File

@@ -25,3 +25,47 @@ def test_runtime_schema_adds_skipped_count_to_existing_crawl_runs_table(monkeypa
columns = {column["name"] for column in inspect(engine).get_columns("crawl_runs")}
assert "skipped_count" in columns
def test_runtime_schema_creates_employee_publications_table_when_employees_exist(monkeypatch):
# Verifies that _ensure_runtime_schema() adds the employee_publications table
# to a database that already has employees and crawl_runs tables.
engine = create_engine("sqlite:///:memory:")
# Create the two pre-existing tables with raw SQL inside one transaction.
with engine.begin() as connection:
connection.execute(
text(
"""
CREATE TABLE employees (
id INTEGER PRIMARY KEY,
profile_key VARCHAR(255) NOT NULL UNIQUE,
canonical_url TEXT NOT NULL,
status VARCHAR(32) NOT NULL DEFAULT 'active',
first_seen_at DATETIME NOT NULL,
last_seen_at DATETIME NOT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL
)
"""
)
)
connection.execute(
text(
"""
CREATE TABLE crawl_runs (
id INTEGER PRIMARY KEY,
source_url TEXT NOT NULL,
status VARCHAR(32) NOT NULL DEFAULT 'running',
found_count INTEGER NOT NULL DEFAULT 0,
parsed_count INTEGER NOT NULL DEFAULT 0,
skipped_count INTEGER NOT NULL DEFAULT 0
)
"""
)
)
# Point app.db at the in-memory engine for the duration of the test.
monkeypatch.setattr("app.db.engine", engine)
# Called twice; NOTE(review): presumably to prove the second run is a harmless no-op - confirm.
_ensure_runtime_schema()
_ensure_runtime_schema()
inspector = inspect(engine)
# The table must exist and expose at least the columns listed below.
assert "employee_publications" in inspector.get_table_names()
columns = {column["name"] for column in inspector.get_columns("employee_publications")}
assert {"employee_id", "publication_id", "doi_url", "authors", "raw_data", "source_hash"}.issubset(columns)

View File

@@ -34,7 +34,21 @@ class FakeSession:
"type": "ARTICLE",
"title": "Дублирование пакетов",
"year": 2023,
"language": {"name": "ru"},
"status": 1,
"authorsByType": {
"author": [
{
"id": "568398853",
"href": "/org/persons/568398853",
"title": {"ru": "Левицкий И. А.", "en": ""},
"reverseTitle": {"ru": "И. А. Левицкий", "en": ""},
}
]
},
"description": {"short": {"ru": "Информационные процессы. 2023."}},
"annotation": {"ru": "<p>Русская аннотация</p>"},
"documents": {"DOI": {"href": "https://doi.org/10.1/test"}},
}
],
},
@@ -153,6 +167,9 @@ def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
assert publications["publications_count"] == 1
assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/888959076"
assert publications["publications"][0]["doi_url"] == "https://doi.org/10.1/test"
assert publications["publications"][0]["annotation"] == {"ru": "Русская аннотация"}
assert publications["publications"][0]["authors"][0]["is_current_employee"] is True
assert theses["theses_count"] == 1
assert theses["theses"][0]["student"] == "Лесняк Владислав Евгеньевич"
assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"