feat: add detailed employee publications storage and MCP docs
This commit is contained in:
@@ -10,7 +10,7 @@ from sqlalchemy.pool import StaticPool
|
||||
from app.config import Settings, get_settings
|
||||
from app.db import Base, get_db
|
||||
from app.main import app
|
||||
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
|
||||
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication
|
||||
from app.security import SESSION_COOKIE, sign_session
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ def test_health_returns_versions():
|
||||
response = client.get("/api/health")
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()["backend_version"] == "0.6.1"
|
||||
assert response.json()["backend_version"] == "0.6.2"
|
||||
|
||||
|
||||
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
|
||||
@@ -154,13 +154,115 @@ def test_mcp_service_info_returns_tools_and_dataset_hash():
|
||||
assert response.status_code == 200
|
||||
payload = json.loads(response.json()["result"]["content"][0]["text"])
|
||||
assert payload["service_name"] == "miem-employees"
|
||||
assert payload["backend_version"] == "0.6.1"
|
||||
assert payload["backend_version"] == "0.6.2"
|
||||
assert payload["dataset"]["hash"]
|
||||
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
|
||||
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
|
||||
def test_mcp_list_employee_publications_prefers_stored_publications_with_fallback():
    """Verify ``list_employee_publications`` prefers stored rows over profile JSON.

    Two employees are seeded, each carrying a ``publications`` section inside
    ``current_data``. Only the "stored" employee additionally owns an
    ``EmployeePublication`` row. The MCP tool is expected to answer with the
    stored row for that employee and with the JSON section entry for the
    "fallback" employee, which has no stored rows.
    """
    # NOTE(review): StaticPool + check_same_thread=False presumably makes every
    # session share one connection to the same in-memory database - confirm.
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    stored_employee = Employee(
        profile_key="staff:stored",
        profile_type="staff",
        profile_id="stored",
        canonical_url="https://www.hse.ru/staff/stored",
        full_name="Stored Person",
        status="active",
        current_data={
            "sections": [
                {
                    "type": "publications",
                    "publications": [{"title": "Old JSON Publication", "url": "https://example.test/old"}],
                }
            ]
        },
    )
    fallback_employee = Employee(
        profile_key="staff:fallback",
        profile_type="staff",
        profile_id="fallback",
        canonical_url="https://www.hse.ru/staff/fallback",
        full_name="Fallback Person",
        status="active",
        current_data={
            "sections": [
                {
                    "type": "publications",
                    "publications": [{"title": "Fallback Publication", "url": "https://example.test/fallback"}],
                }
            ]
        },
    )
    session.add_all([stored_employee, fallback_employee])
    # Commit before creating the publication row: it references stored_employee.id.
    session.commit()
    session.add(
        EmployeePublication(
            employee_id=stored_employee.id,
            publication_id="pub-1",
            title="Stored Publication",
            year=2024,
            publication_type="ARTICLE",
            url="https://publications.hse.ru/view/pub-1",
            doi_url="https://doi.org/10.1/test",
            citation_text="Stored Citation",
            annotation={"ru": "Аннотация", "en": "Abstract"},
            description={"main": "Stored Citation"},
            authors=[{"id": "1", "title_ru": "Автор", "is_current_employee": True}],
            source_hash="a" * 64,
        )
    )
    session.commit()
    session.close()

    def override_db():
        """Yield a session bound to the in-memory engine and always close it."""
        db = Session()
        try:
            yield db
        finally:
            db.close()

    app.dependency_overrides[get_db] = override_db
    # The override is global state on the app object; clear it even when an
    # assertion below fails so it cannot leak into other tests.
    try:
        client = TestClient(app)

        stored_response = client.post(
            "/mcp",
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "list_employee_publications", "arguments": {"profile_id_or_url": "stored"}},
            },
        )
        fallback_response = client.post(
            "/mcp",
            json={
                "jsonrpc": "2.0",
                "id": 2,
                "method": "tools/call",
                "params": {"name": "list_employee_publications", "arguments": {"profile_id_or_url": "fallback"}},
            },
        )

        # Fail with a clear status mismatch rather than a KeyError while parsing.
        assert stored_response.status_code == 200
        assert fallback_response.status_code == 200

        stored_payload = json.loads(stored_response.json()["result"]["content"][0]["text"])
        fallback_payload = json.loads(fallback_response.json()["result"]["content"][0]["text"])
        assert stored_payload["items"][0]["title"] == "Stored Publication"
        assert stored_payload["items"][0]["doi_url"] == "https://doi.org/10.1/test"
        assert stored_payload["items"][0]["annotation"] == {"ru": "Аннотация", "en": "Abstract"}
        assert stored_payload["items"][0]["authors"] == [{"id": "1", "title_ru": "Автор", "is_current_employee": True}]
        assert fallback_payload["items"][0]["title"] == "Fallback Publication"
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
def test_mcp_sync_employees_full_empty_and_unknown_hash_modes():
|
||||
engine = create_engine(
|
||||
"sqlite:///:memory:",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import gzip
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParseResourceCache
|
||||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache
|
||||
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
|
||||
from app.services.resource_cache import ResourceCache
|
||||
|
||||
@@ -191,6 +191,68 @@ def test_upsert_employee_skips_snapshot_when_checksum_is_unchanged(db_session):
|
||||
assert db_session.query(EmployeeSnapshot).count() == 1
|
||||
|
||||
|
||||
def test_upsert_employee_saves_publications_and_reuses_existing_rows(db_session):
    """Upserting the same publication in two crawl runs keeps a single row.

    The same parsed employee payload (one publication with DOI and authors) is
    upserted under two different crawl runs. Afterwards exactly one
    ``EmployeePublication`` row must exist for the employee, carrying the DOI
    URL and author list from the payload.
    """
    first_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    second_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add_all([first_run, second_run])
    db_session.commit()

    # Build each payload with the shared helper instead of repeating the
    # publication literal; every call returns a fresh, equal dict.
    employee, _ = _upsert_employee(db_session, first_run, _parsed_employee_with_publication("published"))
    db_session.commit()
    _upsert_employee(db_session, second_run, _parsed_employee_with_publication("published"))
    db_session.commit()

    publications = db_session.query(EmployeePublication).filter_by(employee_id=employee.id).all()
    assert len(publications) == 1
    assert publications[0].doi_url == "https://doi.org/10.1/test"
    assert publications[0].authors == [{"id": "1", "title_ru": "Автор"}]
|
||||
|
||||
|
||||
def test_upsert_employee_records_publication_errors_without_failing_employee(monkeypatch, db_session):
    """A failing publication sync must not prevent the employee from being saved.

    ``_sync_employee_publications`` is replaced with a stub that always raises.
    The upsert is still expected to report a change, persist the employee, and
    leave exactly one ``CrawlError`` whose message mentions publications.
    """
    crawl_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl_run)
    db_session.commit()

    def always_fails(*_args, **_kwargs):
        """Stand-in for the publication sync that raises unconditionally."""
        raise RuntimeError("boom")

    monkeypatch.setattr("app.services.crawler._sync_employee_publications", always_fails)

    payload = _parsed_employee_with_publication("error-safe")
    saved_employee, was_changed = _upsert_employee(db_session, crawl_run, payload)
    db_session.commit()

    assert was_changed is True
    assert saved_employee.full_name == "Same Person"
    assert db_session.query(Employee).filter_by(profile_key="staff:error-safe").one()
    # .one() also enforces that exactly one error row was recorded.
    recorded_error = db_session.query(CrawlError).one()
    assert "публикации" in recorded_error.message.lower()
|
||||
|
||||
|
||||
def test_checksum_changes_when_widget_data_changes():
|
||||
base = _parsed_employee("widgets")
|
||||
changed = _parsed_employee("widgets")
|
||||
@@ -224,3 +286,31 @@ def _parsed_employee(profile_id: str) -> dict:
|
||||
"parser_version": "0.6.0",
|
||||
"_html": "<html></html>",
|
||||
}
|
||||
|
||||
|
||||
def _parsed_employee_with_publication(profile_id: str) -> dict:
    """Return the ``_parsed_employee`` payload with one detailed publication.

    The ``sections`` entry of the base payload is replaced by a single
    ``publications`` section holding one fully populated publication record.
    """
    publication = {
        "id": "888959076",
        "publication_id": "888959076",
        "title": "Detailed Publication",
        "year": 2023,
        "publication_type": "ARTICLE",
        "language": "ru",
        "status": 1,
        "url": "https://publications.hse.ru/view/888959076",
        "doi_url": "https://doi.org/10.1/test",
        "citation_text": "Detailed citation",
        "annotation": {"ru": "Аннотация"},
        "description": {"main": "Detailed citation"},
        "authors": [{"id": "1", "title_ru": "Автор"}],
        "raw_data": {"id": "888959076", "title": "Detailed Publication"},
    }
    publications_section = {"type": "publications", "publications": [publication]}

    payload = _parsed_employee(profile_id)
    payload["sections"] = [publications_section]
    return payload
|
||||
|
||||
@@ -25,3 +25,47 @@ def test_runtime_schema_adds_skipped_count_to_existing_crawl_runs_table(monkeypa
|
||||
|
||||
columns = {column["name"] for column in inspect(engine).get_columns("crawl_runs")}
|
||||
assert "skipped_count" in columns
|
||||
|
||||
|
||||
def test_runtime_schema_creates_employee_publications_table_when_employees_exist(monkeypatch):
    """The runtime schema step adds ``employee_publications`` to an existing DB.

    An in-memory SQLite database is pre-populated with hand-written
    ``employees`` and ``crawl_runs`` tables, and ``app.db.engine`` is patched to
    point at it. After running ``_ensure_runtime_schema`` the
    ``employee_publications`` table must exist with the expected columns.
    """
    preexisting_tables = (
        """
        CREATE TABLE employees (
            id INTEGER PRIMARY KEY,
            profile_key VARCHAR(255) NOT NULL UNIQUE,
            canonical_url TEXT NOT NULL,
            status VARCHAR(32) NOT NULL DEFAULT 'active',
            first_seen_at DATETIME NOT NULL,
            last_seen_at DATETIME NOT NULL,
            created_at DATETIME NOT NULL,
            updated_at DATETIME NOT NULL
        )
        """,
        """
        CREATE TABLE crawl_runs (
            id INTEGER PRIMARY KEY,
            source_url TEXT NOT NULL,
            status VARCHAR(32) NOT NULL DEFAULT 'running',
            found_count INTEGER NOT NULL DEFAULT 0,
            parsed_count INTEGER NOT NULL DEFAULT 0,
            skipped_count INTEGER NOT NULL DEFAULT 0
        )
        """,
    )

    engine = create_engine("sqlite:///:memory:")
    with engine.begin() as connection:
        for ddl in preexisting_tables:
            connection.execute(text(ddl))
    monkeypatch.setattr("app.db.engine", engine)

    # Called twice on purpose - presumably to show a repeat run does not fail.
    _ensure_runtime_schema()
    _ensure_runtime_schema()

    inspector = inspect(engine)
    assert "employee_publications" in inspector.get_table_names()
    present_columns = {column["name"] for column in inspector.get_columns("employee_publications")}
    required_columns = {"employee_id", "publication_id", "doi_url", "authors", "raw_data", "source_hash"}
    assert required_columns.issubset(present_columns)
|
||||
|
||||
@@ -34,7 +34,21 @@ class FakeSession:
|
||||
"type": "ARTICLE",
|
||||
"title": "Дублирование пакетов",
|
||||
"year": 2023,
|
||||
"language": {"name": "ru"},
|
||||
"status": 1,
|
||||
"authorsByType": {
|
||||
"author": [
|
||||
{
|
||||
"id": "568398853",
|
||||
"href": "/org/persons/568398853",
|
||||
"title": {"ru": "Левицкий И. А.", "en": ""},
|
||||
"reverseTitle": {"ru": "И. А. Левицкий", "en": ""},
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": {"short": {"ru": "Информационные процессы. 2023."}},
|
||||
"annotation": {"ru": "<p>Русская аннотация</p>"},
|
||||
"documents": {"DOI": {"href": "https://doi.org/10.1/test"}},
|
||||
}
|
||||
],
|
||||
},
|
||||
@@ -153,6 +167,9 @@ def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
|
||||
|
||||
assert publications["publications_count"] == 1
|
||||
assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/888959076"
|
||||
assert publications["publications"][0]["doi_url"] == "https://doi.org/10.1/test"
|
||||
assert publications["publications"][0]["annotation"] == {"ru": "Русская аннотация"}
|
||||
assert publications["publications"][0]["authors"][0]["is_current_employee"] is True
|
||||
assert theses["theses_count"] == 1
|
||||
assert theses["theses"][0]["student"] == "Лесняк Владислав Евгеньевич"
|
||||
assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
|
||||
|
||||
Reference in New Issue
Block a user