feat: add dataset checkpoint sync for MCP

This commit is contained in:
Anton
2026-05-14 11:00:46 +03:00
parent a3ff9c6e9c
commit 29451ccee1
9 changed files with 558 additions and 8 deletions

View File

@@ -1,3 +1,4 @@
import json
from datetime import datetime, timezone
from types import SimpleNamespace
@@ -19,7 +20,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.4.7"
assert response.json()["backend_version"] == "0.5.0"
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
@@ -50,7 +51,10 @@ def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
assert without_auth.status_code == 200
assert with_auth.status_code == 200
assert without_auth.json()["result"]["tools"][0]["name"] == "search_employees"
tool_names = {tool["name"] for tool in without_auth.json()["result"]["tools"]}
assert "search_employees" in tool_names
assert "get_service_info" in tool_names
assert "sync_employees" in tool_names
assert any(tool["name"] == "get_crawl_run_details" for tool in without_auth.json()["result"]["tools"])
assert with_auth.json()["result"]["tools"] == without_auth.json()["result"]["tools"]
@@ -108,6 +112,128 @@ def test_mcp_search_employees_returns_matching_employee():
app.dependency_overrides.clear()
def test_mcp_service_info_returns_tools_and_dataset_hash():
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(
Employee(
profile_key="staff:alpha",
profile_type="staff",
profile_id="alpha",
canonical_url="https://www.hse.ru/staff/alpha",
full_name="Alpha Person",
status="active",
current_checksum="a" * 64,
current_data={"sections": []},
)
)
session.commit()
session.close()
def override_db():
db = Session()
try:
yield db
finally:
db.close()
app.dependency_overrides[get_db] = override_db
client = TestClient(app)
response = client.post(
"/mcp",
json={"jsonrpc": "2.0", "id": 1, "method": "tools/call", "params": {"name": "get_service_info", "arguments": {}}},
)
assert response.status_code == 200
payload = json.loads(response.json()["result"]["content"][0]["text"])
assert payload["service_name"] == "miem-employees"
assert payload["backend_version"] == "0.5.0"
assert payload["dataset"]["hash"]
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
app.dependency_overrides.clear()
def test_mcp_sync_employees_full_empty_and_unknown_hash_modes():
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(
Employee(
profile_key="staff:alpha",
profile_type="staff",
profile_id="alpha",
canonical_url="https://www.hse.ru/staff/alpha",
full_name="Alpha Person",
status="active",
current_checksum="a" * 64,
current_data={"sections": [{"type": "paragraphs"}]},
)
)
session.commit()
session.close()
def override_db():
db = Session()
try:
yield db
finally:
db.close()
app.dependency_overrides[get_db] = override_db
client = TestClient(app)
full_response = client.post(
"/mcp",
json={"jsonrpc": "2.0", "id": 1, "method": "tools/call", "params": {"name": "sync_employees", "arguments": {}}},
)
full_payload = json.loads(full_response.json()["result"]["content"][0]["text"])
current_hash = full_payload["to_hash"]
empty_response = client.post(
"/mcp",
json={
"jsonrpc": "2.0",
"id": 2,
"method": "tools/call",
"params": {"name": "sync_employees", "arguments": {"client_hash": current_hash}},
},
)
empty_payload = json.loads(empty_response.json()["result"]["content"][0]["text"])
unknown_response = client.post(
"/mcp",
json={
"jsonrpc": "2.0",
"id": 3,
"method": "tools/call",
"params": {"name": "sync_employees", "arguments": {"client_hash": "missing"}},
},
)
unknown_payload = json.loads(unknown_response.json()["result"]["content"][0]["text"])
assert full_payload["mode"] == "full"
assert full_payload["items"][0]["data"] == {"sections": [{"type": "paragraphs"}]}
assert empty_payload["mode"] == "delta"
assert empty_payload["changes"] == {"added": [], "updated": [], "dismissed": [], "removed": []}
assert unknown_payload["mode"] == "full"
assert unknown_payload["reason"] == "unknown_client_hash"
app.dependency_overrides.clear()
def test_mcp_get_crawl_run_details_returns_changes():
engine = create_engine(
"sqlite:///:memory:",

View File

@@ -0,0 +1,88 @@
from datetime import datetime, timezone
from app.models import Employee
from app.services.dataset_versions import get_or_create_current_version, sync_employees_payload
def _employee(profile_key: str, checksum: str, *, status: str = "active") -> Employee:
return Employee(
profile_key=profile_key,
profile_type=profile_key.split(":", 1)[0],
profile_id=profile_key.split(":", 1)[1],
canonical_url=f"https://www.hse.ru/{profile_key}",
full_name=profile_key,
status=status,
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
current_data={"profile_key": profile_key},
current_checksum=checksum,
)
def test_dataset_version_hash_is_stable_for_same_employee_state(db_session):
db_session.add(_employee("staff:alpha", "a" * 64))
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
second = get_or_create_current_version(db_session)
assert second.id == first.id
assert second.hash == first.hash
assert second.employee_count == 1
def test_dataset_version_hash_changes_when_employee_checksum_changes(db_session):
employee = _employee("staff:alpha", "a" * 64)
db_session.add(employee)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
employee.current_checksum = "b" * 64
db_session.commit()
second = get_or_create_current_version(db_session)
assert second.hash != first.hash
assert second.previous_hash == first.hash
def test_sync_employees_diff_spans_multiple_intermediate_versions(db_session):
alpha = _employee("staff:alpha", "a" * 64)
db_session.add(alpha)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
beta = _employee("staff:beta", "b" * 64)
db_session.add(beta)
db_session.commit()
get_or_create_current_version(db_session)
db_session.commit()
alpha.current_checksum = "c" * 64
alpha.current_data = {"profile_key": "staff:alpha", "changed": True}
db_session.commit()
payload = sync_employees_payload(db_session, client_hash=first.hash, include_data=False)
assert payload["mode"] == "delta"
assert [item["profile_key"] for item in payload["changes"]["added"]] == ["staff:beta"]
assert [item["profile_key"] for item in payload["changes"]["updated"]] == ["staff:alpha"]
assert payload["changes"]["dismissed"] == []
assert payload["changes"]["removed"] == []
def test_sync_employees_reports_dismissed_as_tombstone(db_session):
alpha = _employee("staff:alpha", "a" * 64)
db_session.add(alpha)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
alpha.status = "dismissed"
db_session.commit()
payload = sync_employees_payload(db_session, client_hash=first.hash, include_data=False)
assert payload["changes"]["dismissed"][0]["profile_key"] == "staff:alpha"
assert payload["changes"]["dismissed"][0]["status"] == "dismissed"