Compare commits

...

6 Commits

16 changed files with 761 additions and 14 deletions

1
.gitignore vendored
View File

@@ -8,3 +8,4 @@ pytest-cache-files-*/
.coverage .coverage
htmlcov/ htmlcov/
postgres_data/ postgres_data/
MCP_DESCRIPTION.md

View File

@@ -17,6 +17,7 @@ from app.services.admin_data import (
stats_payload, stats_payload,
) )
from app.services.crawl_control import get_running_run, run_crawl_if_idle from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.services.crawler import refresh_employee
from app.version import BACKEND_VERSION, FRONTEND_VERSION from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/admin") router = APIRouter(prefix="/admin")
@@ -144,10 +145,31 @@ def employee_detail(
return _render( return _render(
request, request,
"employee_detail.html", "employee_detail.html",
{"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots}, {
"employee": employee,
"employee_view": employee_detail_payload(employee),
"snapshots": snapshots,
"refresh_status": request.query_params.get("refresh_status"),
},
) )
@router.post("/employees/{employee_id}/refresh")
def refresh_employee_detail(
employee_id: int,
request: Request,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
employee = db.get(Employee, employee_id)
if not employee:
return RedirectResponse("/admin/directory", status_code=303)
run = refresh_employee(db, employee, settings)
status = "success" if run.status == "completed" else "error"
return RedirectResponse(f"/admin/employees/{employee_id}?refresh_status={status}", status_code=303)
@router.get("/runs", response_class=HTMLResponse) @router.get("/runs", response_class=HTMLResponse)
def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)): def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings) require_admin(request, settings)

View File

@@ -7,12 +7,31 @@ from sqlalchemy.orm import Session
from app.db import get_db from app.db import get_db
from app.models import CrawlRun, Employee from app.models import CrawlRun, Employee
from app.services.admin_data import run_detail_payload from app.services.admin_data import run_detail_payload
from app.services.dataset_versions import service_info_payload, sync_employees_payload
from app.version import BACKEND_VERSION from app.version import BACKEND_VERSION
router = APIRouter(prefix="/mcp") router = APIRouter(prefix="/mcp")
PROTOCOL_VERSION = "2024-11-05"
SERVICE_NAME = "miem-employees"
TOOLS = [ TOOLS = [
{
"name": "get_service_info",
"description": "Return service metadata, supported tools, and current dataset version.",
"inputSchema": {"type": "object", "properties": {}},
},
{
"name": "sync_employees",
"description": "Synchronize employees by dataset hash. Returns a full snapshot or a delta from client_hash.",
"inputSchema": {
"type": "object",
"properties": {
"client_hash": {"type": "string"},
"include_data": {"type": "boolean", "default": True},
},
},
},
{ {
"name": "search_employees", "name": "search_employees",
"description": "Search MIEM employees by name or profile URL.", "description": "Search MIEM employees by name or profile URL.",
@@ -71,8 +90,8 @@ async def mcp_http(
try: try:
if method == "initialize": if method == "initialize":
result = { result = {
"protocolVersion": "2024-11-05", "protocolVersion": PROTOCOL_VERSION,
"serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION}, "serverInfo": {"name": SERVICE_NAME, "version": BACKEND_VERSION},
"capabilities": {"tools": {}}, "capabilities": {"tools": {}},
} }
elif method == "tools/list": elif method == "tools/list":
@@ -87,6 +106,24 @@ async def mcp_http(
def _call_tool(db: Session, name: str, arguments: dict) -> dict: def _call_tool(db: Session, name: str, arguments: dict) -> dict:
if name == "get_service_info":
return _tool_response(
service_info_payload(
db,
tools=TOOLS,
service_name=SERVICE_NAME,
backend_version=BACKEND_VERSION,
protocol_version=PROTOCOL_VERSION,
)
)
if name == "sync_employees":
return _tool_response(
sync_employees_payload(
db,
client_hash=arguments.get("client_hash"),
include_data=bool(arguments.get("include_data", True)),
)
)
if name == "search_employees": if name == "search_employees":
return _tool_response(_search_employees(db, arguments)) return _tool_response(_search_employees(db, arguments))
if name == "get_employee": if name == "get_employee":

View File

@@ -76,6 +76,7 @@ class CrawlRun(Base):
message: Mapped[str | None] = mapped_column(Text) message: Mapped[str | None] = mapped_column(Text)
employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run") employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
dataset_versions: Mapped[list["DatasetVersion"]] = relationship(back_populates="crawl_run")
class CrawlRunEmployeeChange(Base): class CrawlRunEmployeeChange(Base):
@@ -134,3 +135,42 @@ class ParserSource(Base):
source_url: Mapped[str] = mapped_column(Text, nullable=False) source_url: Mapped[str] = mapped_column(Text, nullable=False)
enabled: Mapped[bool] = mapped_column(default=True, nullable=False) enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False) created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
class DatasetVersion(Base):
__tablename__ = "dataset_versions"
__table_args__ = (
UniqueConstraint("hash", name="uq_dataset_versions_hash"),
Index("ix_dataset_versions_created_at", "created_at"),
)
id: Mapped[int] = mapped_column(Integer, primary_key=True)
hash: Mapped[str] = mapped_column(String(64), nullable=False)
previous_hash: Mapped[str | None] = mapped_column(String(64))
crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
employee_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
active_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
crawl_run: Mapped[CrawlRun | None] = relationship(back_populates="dataset_versions")
items: Mapped[list["DatasetVersionItem"]] = relationship(back_populates="dataset_version", cascade="all, delete-orphan")
class DatasetVersionItem(Base):
__tablename__ = "dataset_version_items"
__table_args__ = (
UniqueConstraint("dataset_version_id", "profile_key", name="uq_dataset_version_items_version_profile"),
Index("ix_dataset_version_items_hash", "dataset_version_id"),
Index("ix_dataset_version_items_profile_key", "profile_key"),
)
id: Mapped[int] = mapped_column(Integer, primary_key=True)
dataset_version_id: Mapped[int] = mapped_column(ForeignKey("dataset_versions.id"), nullable=False)
profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
status: Mapped[str] = mapped_column(String(32), nullable=False)
checksum: Mapped[str] = mapped_column(String(64), nullable=False)
dataset_version: Mapped[DatasetVersion] = relationship(back_populates="items")
employee: Mapped[Employee | None] = relationship()

View File

@@ -387,7 +387,7 @@ def _infer_section_type(title: str, nodes: list) -> str:
lowered = title.lower() lowered = title.lower()
if _has_table(nodes): if _has_table(nodes):
return "table" return "table"
if "публикац" in lowered: if _is_publications_title(lowered):
return "publications" return "publications"
if "учебные курсы" in lowered: if "учебные курсы" in lowered:
return "courses_by_year" return "courses_by_year"
@@ -398,6 +398,10 @@ def _infer_section_type(title: str, nodes: list) -> str:
return "generic" return "generic"
def _is_publications_title(lowered_title: str) -> bool:
return lowered_title.startswith("публикац")
def _has_table(nodes: list) -> bool: def _has_table(nodes: list) -> bool:
return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes) return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes)

View File

@@ -13,6 +13,7 @@ from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, E
from app.parser.collector import collect_profile_links from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key from app.parser.profile_url import profile_key
from app.services.dataset_versions import get_or_create_current_version
HEADERS = { HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)" "User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
@@ -70,6 +71,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout) run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed" run.status = "completed"
get_or_create_current_version(db, crawl_run_id=run.id)
except Exception as exc: except Exception as exc:
run.status = "failed" run.status = "failed"
run.message = str(exc) run.message = str(exc)
@@ -80,6 +82,49 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
return run return run
def refresh_employee(db: Session, employee: Employee, settings: Settings) -> CrawlRun:
run = CrawlRun(source_url=employee.canonical_url, status="running", found_count=1)
db.add(run)
db.commit()
db.refresh(run)
try:
with requests.Session() as session:
parsed = parse_person_profile(
session,
employee.canonical_url,
HEADERS,
settings.request_timeout,
settings.parser_use_playwright,
)
if not parsed:
raise ValueError("Профиль не удалось распарсить.")
if _parsed_profile_key(parsed) != employee.profile_key:
raise ValueError("Распарсенный профиль не совпадает с обновляемым сотрудником.")
_upsert_employee(db, run, parsed)
run.parsed_count = 1
run.status = "completed"
get_or_create_current_version(db, crawl_run_id=run.id)
except Exception as exc:
run.status = "failed"
run.error_count = 1
run.message = str(exc)
db.add(
CrawlError(
crawl_run_id=run.id,
profile_url=employee.canonical_url,
error_type=type(exc).__name__,
message=str(exc),
)
)
finally:
run.finished_at = datetime.now(timezone.utc)
db.commit()
db.refresh(run)
return run
def _ensure_source(db: Session, source_url: str) -> ParserSource: def _ensure_source(db: Session, source_url: str) -> ParserSource:
source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url)) source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url))
if source: if source:
@@ -91,10 +136,14 @@ def _ensure_source(db: Session, source_url: str) -> ParserSource:
return source return source
def _parsed_profile_key(parsed: dict) -> str:
return f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
html = parsed.pop("_html", None) html = parsed.pop("_html", None)
checksum = _checksum(parsed) checksum = _checksum(parsed)
key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}" key = _parsed_profile_key(parsed)
employee = db.scalar(select(Employee).where(Employee.profile_key == key)) employee = db.scalar(select(Employee).where(Employee.profile_key == key))
now = datetime.now(timezone.utc) now = datetime.now(timezone.utc)
if not employee: if not employee:

View File

@@ -0,0 +1,227 @@
import hashlib
import json
from dataclasses import dataclass
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.models import DatasetVersion, DatasetVersionItem, Employee
@dataclass(frozen=True)
class EmployeeMarker:
profile_key: str
employee_id: int | None
status: str
checksum: str
def get_or_create_current_version(db: Session, *, crawl_run_id: int | None = None) -> DatasetVersion:
employees = db.scalars(select(Employee).order_by(Employee.profile_key)).all()
markers = [_employee_marker(employee) for employee in employees]
dataset_hash = _dataset_hash(markers)
latest = get_latest_version(db)
if latest and latest.hash == dataset_hash:
return latest
active_count = sum(1 for marker in markers if marker.status == "active")
dismissed_count = sum(1 for marker in markers if marker.status == "dismissed")
version = DatasetVersion(
hash=dataset_hash,
previous_hash=latest.hash if latest else None,
crawl_run_id=crawl_run_id,
employee_count=len(markers),
active_count=active_count,
dismissed_count=dismissed_count,
)
db.add(version)
db.flush()
for marker in markers:
db.add(
DatasetVersionItem(
dataset_version_id=version.id,
profile_key=marker.profile_key,
employee_id=marker.employee_id,
status=marker.status,
checksum=marker.checksum,
)
)
db.flush()
return version
def get_latest_version(db: Session) -> DatasetVersion | None:
return db.scalar(select(DatasetVersion).order_by(desc(DatasetVersion.created_at), desc(DatasetVersion.id)).limit(1))
def get_version_by_hash(db: Session, dataset_hash: str | None) -> DatasetVersion | None:
if not dataset_hash:
return None
return db.scalar(select(DatasetVersion).where(DatasetVersion.hash == dataset_hash).limit(1))
def service_info_payload(db: Session, *, tools: list[dict], service_name: str, backend_version: str, protocol_version: str) -> dict:
version = get_or_create_current_version(db)
db.commit()
return {
"service_name": service_name,
"backend_version": backend_version,
"protocolVersion": protocol_version,
"tools": tools,
"dataset": _version_payload(version),
}
def sync_employees_payload(db: Session, *, client_hash: str | None = None, include_data: bool = True) -> dict:
current = get_or_create_current_version(db)
db.commit()
if not client_hash:
return _full_sync_payload(db, current, include_data=include_data, reason=None)
if client_hash == current.hash:
return {
"mode": "delta",
"from_hash": client_hash,
"to_hash": current.hash,
"dataset": _version_payload(current),
"changes": {"added": [], "updated": [], "dismissed": [], "removed": []},
}
previous = get_version_by_hash(db, client_hash)
if not previous:
return _full_sync_payload(db, current, include_data=include_data, reason="unknown_client_hash", from_hash=client_hash)
return _delta_sync_payload(db, previous, current, include_data=include_data)
def _full_sync_payload(
db: Session,
current: DatasetVersion,
*,
include_data: bool,
reason: str | None,
from_hash: str | None = None,
) -> dict:
employees = db.scalars(select(Employee).order_by(Employee.profile_key)).all()
payload = {
"mode": "full",
"from_hash": from_hash,
"to_hash": current.hash,
"dataset": _version_payload(current),
"items": [_employee_payload(employee, include_data=include_data) for employee in employees],
}
if reason:
payload["reason"] = reason
return payload
def _delta_sync_payload(db: Session, previous: DatasetVersion, current: DatasetVersion, *, include_data: bool) -> dict:
previous_items = _items_by_profile_key(previous)
current_items = _items_by_profile_key(current)
employees = {employee.profile_key: employee for employee in db.scalars(select(Employee)).all()}
added = []
updated = []
dismissed = []
removed = []
for profile_key, current_item in sorted(current_items.items()):
previous_item = previous_items.get(profile_key)
employee = employees.get(profile_key)
if not previous_item:
if employee:
added.append(_employee_payload(employee, include_data=include_data))
continue
if previous_item.checksum == current_item.checksum and previous_item.status == current_item.status:
continue
if current_item.status == "dismissed":
dismissed.append(_tombstone(profile_key, current_item.status, employee))
elif employee:
updated.append(_employee_payload(employee, include_data=include_data))
for profile_key, previous_item in sorted(previous_items.items()):
if profile_key not in current_items:
removed.append(_tombstone(profile_key, "removed", employees.get(profile_key), checksum=previous_item.checksum))
return {
"mode": "delta",
"from_hash": previous.hash,
"to_hash": current.hash,
"dataset": _version_payload(current),
"changes": {
"added": added,
"updated": updated,
"dismissed": dismissed,
"removed": removed,
},
}
def _items_by_profile_key(version: DatasetVersion) -> dict[str, DatasetVersionItem]:
return {item.profile_key: item for item in version.items}
def _version_payload(version: DatasetVersion) -> dict:
return {
"hash": version.hash,
"previous_hash": version.previous_hash,
"created_at": version.created_at.isoformat() if version.created_at else None,
"crawl_run_id": version.crawl_run_id,
"employee_count": version.employee_count,
"active_count": version.active_count,
"dismissed_count": version.dismissed_count,
}
def _employee_marker(employee: Employee) -> EmployeeMarker:
return EmployeeMarker(
profile_key=employee.profile_key,
employee_id=employee.id,
status=employee.status,
checksum=employee.current_checksum or _payload_hash(employee.current_data or {}),
)
def _dataset_hash(markers: list[EmployeeMarker]) -> str:
payload = [
{"profile_key": marker.profile_key, "status": marker.status, "checksum": marker.checksum}
for marker in sorted(markers, key=lambda item: item.profile_key)
]
return _payload_hash(payload)
def _payload_hash(value: object) -> str:
payload = json.dumps(value, ensure_ascii=False, sort_keys=True, separators=(",", ":"), default=str)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
def _employee_payload(employee: Employee, *, include_data: bool) -> dict:
payload = {
"profile_key": employee.profile_key,
"profile_id": employee.profile_id,
"full_name": employee.full_name,
"status": employee.status,
"canonical_url": employee.canonical_url,
"last_seen_at": employee.last_seen_at.isoformat() if employee.last_seen_at else None,
"dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None,
"checksum": employee.current_checksum or _payload_hash(employee.current_data or {}),
}
if include_data:
payload["data"] = employee.current_data
return payload
def _tombstone(profile_key: str, status: str, employee: Employee | None, *, checksum: str | None = None) -> dict:
payload = {
"profile_key": profile_key,
"status": status,
"checksum": checksum or (employee.current_checksum if employee else None),
}
if employee:
payload.update(
{
"profile_id": employee.profile_id,
"full_name": employee.full_name,
"canonical_url": employee.canonical_url,
"dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None,
}
)
return payload

View File

@@ -171,6 +171,10 @@
background: transparent; background: transparent;
} }
.button--compact {
padding: 8px 12px;
}
.code { .code {
overflow-x: auto; overflow-x: auto;
padding: 14px; padding: 14px;
@@ -201,11 +205,34 @@
gap: 10px; gap: 10px;
} }
.employee-card__actions {
display: grid;
justify-items: end;
gap: 10px;
}
.employee-card__title { .employee-card__title {
margin: 0; margin: 0;
font-size: 24px; font-size: 24px;
} }
.employee-card__notice {
margin: 0;
padding: 12px 14px;
border-radius: 8px;
font-weight: 700;
}
.employee-card__notice--success {
color: #065f46;
background: #d1fae5;
}
.employee-card__notice--error {
color: #991b1b;
background: #fee2e2;
}
.employee-card__section { .employee-card__section {
padding: 20px; padding: 20px;
background: #ffffff; background: #ffffff;

View File

@@ -7,8 +7,18 @@
<h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2> <h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2>
<span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span> <span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span>
</div> </div>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a> <div class="employee-card__actions">
<form method="post" action="/admin/employees/{{ employee.id }}/refresh">
<button class="button button--compact" type="submit">Обновить данные</button>
</form>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a>
</div>
</div> </div>
{% if refresh_status == "success" %}
<p class="employee-card__notice employee-card__notice--success">Данные сотрудника обновлены.</p>
{% elif refresh_status == "error" %}
<p class="employee-card__notice employee-card__notice--error">Не удалось обновить данные сотрудника.</p>
{% endif %}
<section class="employee-card__section"> <section class="employee-card__section">
<h3 class="employee-section__title">Основная информация</h3> <h3 class="employee-section__title">Основная информация</h3>

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.4.6" APP_VERSION = "0.5.0"
FRONTEND_VERSION = "0.4.6" FRONTEND_VERSION = "0.5.0"
BACKEND_VERSION = "0.4.6" BACKEND_VERSION = "0.5.0"

View File

@@ -0,0 +1,29 @@
CREATE TABLE IF NOT EXISTS dataset_versions (
id SERIAL PRIMARY KEY,
hash VARCHAR(64) NOT NULL UNIQUE,
previous_hash VARCHAR(64),
crawl_run_id INTEGER REFERENCES crawl_runs(id),
employee_count INTEGER NOT NULL DEFAULT 0,
active_count INTEGER NOT NULL DEFAULT 0,
dismissed_count INTEGER NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS ix_dataset_versions_created_at
ON dataset_versions (created_at);
CREATE TABLE IF NOT EXISTS dataset_version_items (
id SERIAL PRIMARY KEY,
dataset_version_id INTEGER NOT NULL REFERENCES dataset_versions(id),
profile_key VARCHAR(255) NOT NULL,
employee_id INTEGER REFERENCES employees(id),
status VARCHAR(32) NOT NULL,
checksum VARCHAR(64) NOT NULL,
CONSTRAINT uq_dataset_version_items_version_profile UNIQUE (dataset_version_id, profile_key)
);
CREATE INDEX IF NOT EXISTS ix_dataset_version_items_hash
ON dataset_version_items (dataset_version_id);
CREATE INDEX IF NOT EXISTS ix_dataset_version_items_profile_key
ON dataset_version_items (profile_key);

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "miem-workers" name = "miem-workers"
version = "0.4.6" version = "0.5.0"
description = "MIEM employees parser, admin API, and MCP server" description = "MIEM employees parser, admin API, and MCP server"
requires-python = ">=3.11" requires-python = ">=3.11"
dependencies = [ dependencies = [

View File

@@ -1,7 +1,9 @@
import json
from datetime import datetime, timezone from datetime import datetime, timezone
from types import SimpleNamespace
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from sqlalchemy import create_engine from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool from sqlalchemy.pool import StaticPool
@@ -18,7 +20,7 @@ def test_health_returns_versions():
response = client.get("/api/health") response = client.get("/api/health")
assert response.status_code == 200 assert response.status_code == 200
assert response.json()["backend_version"] == "0.4.6" assert response.json()["backend_version"] == "0.5.0"
def test_mcp_lists_tools_without_auth_and_ignores_auth_header(): def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
@@ -49,7 +51,10 @@ def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
assert without_auth.status_code == 200 assert without_auth.status_code == 200
assert with_auth.status_code == 200 assert with_auth.status_code == 200
assert without_auth.json()["result"]["tools"][0]["name"] == "search_employees" tool_names = {tool["name"] for tool in without_auth.json()["result"]["tools"]}
assert "search_employees" in tool_names
assert "get_service_info" in tool_names
assert "sync_employees" in tool_names
assert any(tool["name"] == "get_crawl_run_details" for tool in without_auth.json()["result"]["tools"]) assert any(tool["name"] == "get_crawl_run_details" for tool in without_auth.json()["result"]["tools"])
assert with_auth.json()["result"]["tools"] == without_auth.json()["result"]["tools"] assert with_auth.json()["result"]["tools"] == without_auth.json()["result"]["tools"]
@@ -107,6 +112,128 @@ def test_mcp_search_employees_returns_matching_employee():
app.dependency_overrides.clear() app.dependency_overrides.clear()
def test_mcp_service_info_returns_tools_and_dataset_hash():
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(
Employee(
profile_key="staff:alpha",
profile_type="staff",
profile_id="alpha",
canonical_url="https://www.hse.ru/staff/alpha",
full_name="Alpha Person",
status="active",
current_checksum="a" * 64,
current_data={"sections": []},
)
)
session.commit()
session.close()
def override_db():
db = Session()
try:
yield db
finally:
db.close()
app.dependency_overrides[get_db] = override_db
client = TestClient(app)
response = client.post(
"/mcp",
json={"jsonrpc": "2.0", "id": 1, "method": "tools/call", "params": {"name": "get_service_info", "arguments": {}}},
)
assert response.status_code == 200
payload = json.loads(response.json()["result"]["content"][0]["text"])
assert payload["service_name"] == "miem-employees"
assert payload["backend_version"] == "0.5.0"
assert payload["dataset"]["hash"]
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
app.dependency_overrides.clear()
def test_mcp_sync_employees_full_empty_and_unknown_hash_modes():
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(
Employee(
profile_key="staff:alpha",
profile_type="staff",
profile_id="alpha",
canonical_url="https://www.hse.ru/staff/alpha",
full_name="Alpha Person",
status="active",
current_checksum="a" * 64,
current_data={"sections": [{"type": "paragraphs"}]},
)
)
session.commit()
session.close()
def override_db():
db = Session()
try:
yield db
finally:
db.close()
app.dependency_overrides[get_db] = override_db
client = TestClient(app)
full_response = client.post(
"/mcp",
json={"jsonrpc": "2.0", "id": 1, "method": "tools/call", "params": {"name": "sync_employees", "arguments": {}}},
)
full_payload = json.loads(full_response.json()["result"]["content"][0]["text"])
current_hash = full_payload["to_hash"]
empty_response = client.post(
"/mcp",
json={
"jsonrpc": "2.0",
"id": 2,
"method": "tools/call",
"params": {"name": "sync_employees", "arguments": {"client_hash": current_hash}},
},
)
empty_payload = json.loads(empty_response.json()["result"]["content"][0]["text"])
unknown_response = client.post(
"/mcp",
json={
"jsonrpc": "2.0",
"id": 3,
"method": "tools/call",
"params": {"name": "sync_employees", "arguments": {"client_hash": "missing"}},
},
)
unknown_payload = json.loads(unknown_response.json()["result"]["content"][0]["text"])
assert full_payload["mode"] == "full"
assert full_payload["items"][0]["data"] == {"sections": [{"type": "paragraphs"}]}
assert empty_payload["mode"] == "delta"
assert empty_payload["changes"] == {"added": [], "updated": [], "dismissed": [], "removed": []}
assert unknown_payload["mode"] == "full"
assert unknown_payload["reason"] == "unknown_client_hash"
app.dependency_overrides.clear()
def test_mcp_get_crawl_run_details_returns_changes(): def test_mcp_get_crawl_run_details_returns_changes():
engine = create_engine( engine = create_engine(
"sqlite:///:memory:", "sqlite:///:memory:",
@@ -248,3 +375,56 @@ def test_api_employees_and_stats_require_admin_session():
assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person" assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person"
app.dependency_overrides.clear() app.dependency_overrides.clear()
def test_admin_refresh_employee_route_updates_only_requested_employee(monkeypatch):
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
db = Session()
db.add(
Employee(
profile_key="org_person:133709486",
profile_type="org_person",
profile_id="133709486",
canonical_url="https://www.hse.ru/org/persons/133709486",
full_name="Будков Юрий Алексеевич",
status="active",
)
)
db.commit()
employee_id = db.scalar(select(Employee.id))
db.close()
settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")
def override_db():
session = Session()
try:
yield session
finally:
session.close()
calls = []
def fake_refresh_employee(db, refreshed_employee, route_settings):
calls.append((refreshed_employee.id, route_settings))
return SimpleNamespace(status="completed")
app.dependency_overrides[get_db] = override_db
app.dependency_overrides[get_settings] = lambda: settings
monkeypatch.setattr("app.admin.refresh_employee", fake_refresh_employee)
client = TestClient(app)
client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))
response = client.post(f"/admin/employees/{employee_id}/refresh", follow_redirects=False)
assert response.status_code == 303
assert response.headers["location"] == f"/admin/employees/{employee_id}?refresh_status=success"
assert calls == [(employee_id, settings)]
app.dependency_overrides.clear()

View File

@@ -0,0 +1,88 @@
from datetime import datetime, timezone
from app.models import Employee
from app.services.dataset_versions import get_or_create_current_version, sync_employees_payload
def _employee(profile_key: str, checksum: str, *, status: str = "active") -> Employee:
return Employee(
profile_key=profile_key,
profile_type=profile_key.split(":", 1)[0],
profile_id=profile_key.split(":", 1)[1],
canonical_url=f"https://www.hse.ru/{profile_key}",
full_name=profile_key,
status=status,
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
current_data={"profile_key": profile_key},
current_checksum=checksum,
)
def test_dataset_version_hash_is_stable_for_same_employee_state(db_session):
db_session.add(_employee("staff:alpha", "a" * 64))
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
second = get_or_create_current_version(db_session)
assert second.id == first.id
assert second.hash == first.hash
assert second.employee_count == 1
def test_dataset_version_hash_changes_when_employee_checksum_changes(db_session):
employee = _employee("staff:alpha", "a" * 64)
db_session.add(employee)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
employee.current_checksum = "b" * 64
db_session.commit()
second = get_or_create_current_version(db_session)
assert second.hash != first.hash
assert second.previous_hash == first.hash
def test_sync_employees_diff_spans_multiple_intermediate_versions(db_session):
alpha = _employee("staff:alpha", "a" * 64)
db_session.add(alpha)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
beta = _employee("staff:beta", "b" * 64)
db_session.add(beta)
db_session.commit()
get_or_create_current_version(db_session)
db_session.commit()
alpha.current_checksum = "c" * 64
alpha.current_data = {"profile_key": "staff:alpha", "changed": True}
db_session.commit()
payload = sync_employees_payload(db_session, client_hash=first.hash, include_data=False)
assert payload["mode"] == "delta"
assert [item["profile_key"] for item in payload["changes"]["added"]] == ["staff:beta"]
assert [item["profile_key"] for item in payload["changes"]["updated"]] == ["staff:alpha"]
assert payload["changes"]["dismissed"] == []
assert payload["changes"]["removed"] == []
def test_sync_employees_reports_dismissed_as_tombstone(db_session):
alpha = _employee("staff:alpha", "a" * 64)
db_session.add(alpha)
db_session.commit()
first = get_or_create_current_version(db_session)
db_session.commit()
alpha.status = "dismissed"
db_session.commit()
payload = sync_employees_payload(db_session, client_hash=first.hash, include_data=False)
assert payload["changes"]["dismissed"][0]["profile_key"] == "staff:alpha"
assert payload["changes"]["dismissed"][0]["status"] == "dismissed"

View File

@@ -27,4 +27,6 @@ def test_employee_detail_template_is_human_readable():
assert "Дата увольнения" in template assert "Дата увольнения" in template
assert "Тип профиля" in template assert "Тип профиля" in template
assert "ID профиля" in template assert "ID профиля" in template
assert "Обновить данные" in template
assert 'action="/admin/employees/{{ employee.id }}/refresh"' in template
assert "Снапшоты" in template assert "Снапшоты" in template

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs, extract_sections
from app.parser.profile_url import normalize_profile_url, parse_profile_identity from app.parser.profile_url import normalize_profile_url, parse_profile_identity
@@ -184,3 +184,34 @@ def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"] assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"]
assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790" assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790"
assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323" assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323"
def test_news_heading_with_publications_word_does_not_absorb_widget_publications():
soup = BeautifulSoup(
"""
<h2>Статья профессора МИЭМ вошла в число самых популярных публикаций на портале SpringerLink</h2>
<div class="post__text">
<p>Первоначально статья профессора вышла в российском журнале.</p>
</div>
<script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
""",
"html.parser",
)
session = FakeSession()
sections = extract_sections(soup, "https://www.hse.ru/org/persons/133709486")
sections = enrich_sections_from_hse_widgets(
session,
soup,
"https://www.hse.ru/org/persons/133709486",
{"User-Agent": "test"},
10,
sections,
)
assert sections[0]["type"] == "paragraphs"
assert sections[0]["title"].startswith("Статья профессора")
publications = [section for section in sections if section["type"] == "publications"]
assert len(publications) == 1
assert publications[0]["title"] == "Публикации и исследования"
assert publications[0]["publications_count"] == 1