From d0459a2c3018929491a4c828e1792a191d8640d4 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 6 May 2026 15:13:15 +0300 Subject: [PATCH] feat: track crawl run employee changes and verify dismissals --- README.md | 2 +- app/admin.py | 23 ++++- app/api.py | 16 +++- app/mcp.py | 16 +++- app/models.py | 26 +++++ app/services/admin_data.py | 68 ++++++++++++- app/services/crawler.py | 69 +++++++++++++- app/templates/run_detail.html | 64 +++++++++++++ app/templates/runs.html | 2 +- app/version.py | 6 +- migrations/003_crawl_run_employee_changes.sql | 21 ++++ pyproject.toml | 2 +- tests/test_admin_data.py | 43 ++++++++- tests/test_admin_templates.py | 16 ++++ tests/test_api_mcp.py | 95 ++++++++++++++++++- tests/test_crawler.py | 75 +++++++++++++-- 16 files changed, 517 insertions(+), 27 deletions(-) create mode 100644 app/templates/run_detail.html create mode 100644 migrations/003_crawl_run_employee_changes.sql diff --git a/README.md b/README.md index de0a8d9..d6b61c7 100644 --- a/README.md +++ b/README.md @@ -131,4 +131,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql docker compose down ``` -Версия сервиса: `0.3.0`. Админка всегда показывает версии backend и frontend в footer. +Версия сервиса: `0.4.0`. Админка всегда показывает версии backend и frontend в footer. diff --git a/app/admin.py b/app/admin.py index 5fba191..0348ff4 100644 --- a/app/admin.py +++ b/app/admin.py @@ -8,7 +8,14 @@ from app.config import Settings, get_settings from app.db import SessionLocal, get_db from app.models import CrawlError, CrawlRun, Employee from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin -from app.services.admin_data import employee_detail_payload, format_admin_datetime, list_employees_page, run_payload, stats_payload +from app.services.admin_data import ( + employee_detail_payload, + format_admin_datetime, + list_employees_page, + run_detail_payload, + run_payload, + stats_payload, +) from app.services.crawl_control import get_running_run, run_crawl_if_idle from app.version import BACKEND_VERSION, FRONTEND_VERSION @@ -150,6 +157,20 @@ def runs(request: Request, db: Session = Depends(get_db), settings: Settings = D return _render(request, "runs.html", {"runs": items, "errors": errors}) +@router.get("/runs/{run_id}", response_class=HTMLResponse) +def run_detail( + run_id: int, + request: Request, + db: Session = Depends(get_db), + settings: Settings = Depends(get_settings), +): + require_admin(request, settings) + run = db.get(CrawlRun, run_id) + if not run: + return RedirectResponse("/admin/runs", status_code=303) + return _render(request, "run_detail.html", {"run": run_detail_payload(db, run)}) + + @router.post("/runs") def trigger_run( request: Request, diff --git a/app/api.py b/app/api.py index 12f3a78..37b65fa 100644 --- a/app/api.py +++ b/app/api.py @@ -8,7 +8,7 @@ from app.config import Settings, get_settings from app.db import SessionLocal, get_db from app.models import CrawlRun, Employee from app.security import require_admin -from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload +from app.services.admin_data import employee_display_payload, list_employees_page, run_detail_payload, run_payload, stats_payload from app.services.crawl_control import get_running_run, run_crawl_if_idle from app.version import BACKEND_VERSION, FRONTEND_VERSION @@ -88,6 +88,20 @@ def latest_crawl_run( return {"running": run_payload(running), "latest": run_payload(latest)} +@router.get("/crawl-runs/{run_id}") +def get_crawl_run( + run_id: int, + request: Request, + db: Session = Depends(get_db), + settings: Settings = Depends(get_settings), +) -> dict: + require_admin(request, settings) + run = db.get(CrawlRun, run_id) + if not run: + return {"error": "not_found"} + return run_detail_payload(db, run) or {"error": "not_found"} + + @router.post("/crawl-runs") def trigger_crawl( request: Request, diff --git a/app/mcp.py b/app/mcp.py index ed5a311..1725b35 100644 --- a/app/mcp.py +++ b/app/mcp.py @@ -8,6 +8,8 @@ from app.config import Settings, get_settings from app.db import get_db from app.models import CrawlRun, Employee from app.security import mcp_protected_resource_metadata, require_mcp_auth +from app.services.admin_data import run_detail_payload +from app.version import BACKEND_VERSION router = APIRouter(prefix="/mcp") metadata_router = APIRouter() @@ -47,6 +49,15 @@ TOOLS = [ "description": "Return the latest crawl run status.", "inputSchema": {"type": "object", "properties": {}}, }, + { + "name": "get_crawl_run_details", + "description": "Return detailed employee changes and errors for one crawl run.", + "inputSchema": { + "type": "object", + "properties": {"run_id": {"type": "integer"}}, + "required": ["run_id"], + }, + }, ] @@ -66,7 +77,7 @@ async def mcp_http( if method == "initialize": result = { "protocolVersion": "2024-11-05", - "serverInfo": {"name": "miem-employees", "version": "0.1.0"}, + "serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION}, "capabilities": {"tools": {}}, } elif method == "tools/list": @@ -95,6 +106,9 @@ def _call_tool(db: Session, name: str, arguments: dict) -> dict: if name == "get_crawl_status": run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1)) return _tool_response(_run_payload(run) if run else {"status": "never_run"}) + if name == "get_crawl_run_details": + run = db.get(CrawlRun, int(arguments["run_id"])) + return _tool_response(run_detail_payload(db, run) if run else {"error": "not_found"}) raise ValueError(f"Unknown tool: {name}") diff --git a/app/models.py b/app/models.py index dd84c56..eeba774 100644 --- a/app/models.py +++ b/app/models.py @@ -41,6 +41,7 @@ class Employee(Base): snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee") tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan") + crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee") class EmployeeSnapshot(Base): @@ -74,6 +75,31 @@ class CrawlRun(Base): dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) message: Mapped[str | None] = mapped_column(Text) + employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run") + + +class CrawlRunEmployeeChange(Base): + __tablename__ = "crawl_run_employee_changes" + __table_args__ = ( + Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"), + Index("ix_crawl_run_employee_changes_employee_id", "employee_id"), + Index("ix_crawl_run_employee_changes_change_type", "change_type"), + ) + + id: Mapped[int] = mapped_column(Integer, primary_key=True) + crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False) + employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id")) + profile_key: Mapped[str] = mapped_column(String(255), nullable=False) + profile_url: Mapped[str] = mapped_column(Text, nullable=False) + full_name: Mapped[str | None] = mapped_column(Text) + change_type: Mapped[str] = mapped_column(String(32), nullable=False) + profile_available: Mapped[bool | None] = mapped_column() + message: Mapped[str | None] = mapped_column(Text) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False) + + crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes") + employee: Mapped[Employee | None] = relationship(back_populates="crawl_run_changes") + class CrawlError(Base): __tablename__ = "crawl_errors" diff --git a/app/services/admin_data.py b/app/services/admin_data.py index 09fc9de..dc9fa82 100644 --- a/app/services/admin_data.py +++ b/app/services/admin_data.py @@ -8,7 +8,7 @@ from zoneinfo import ZoneInfo from sqlalchemy import Select, Text, and_, desc, func, or_, select from sqlalchemy.orm import Session -from app.models import CrawlRun, Employee +from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee EMPLOYEE_SORTS = { "full_name": Employee.full_name, @@ -175,6 +175,26 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None: } +def run_detail_payload(db: Session, run: CrawlRun | None) -> dict[str, Any] | None: + if not run: + return None + changes = db.scalars( + select(CrawlRunEmployeeChange) + .where(CrawlRunEmployeeChange.crawl_run_id == run.id) + .order_by(CrawlRunEmployeeChange.created_at, CrawlRunEmployeeChange.id) + ).all() + errors = db.scalars(select(CrawlError).where(CrawlError.crawl_run_id == run.id).order_by(CrawlError.created_at)).all() + grouped_changes = {"new": [], "missing_from_source": [], "dismissed": []} + for change in changes: + grouped_changes.setdefault(change.change_type, []).append(_change_payload(change)) + return { + **(run_payload(run) or {}), + "changes_detail_available": bool(changes), + "changes": grouped_changes, + "errors": [_crawl_error_payload(error) for error in errors], + } + + def format_admin_datetime(value: Any) -> str: if not value: return "Не указано" @@ -200,6 +220,52 @@ def _run_status_display(status: str | None) -> str: return labels.get(status or "", status or "Не указано") +def _change_payload(change: CrawlRunEmployeeChange) -> dict[str, Any]: + return { + "id": change.id, + "employee_id": change.employee_id, + "profile_key": change.profile_key, + "profile_url": change.profile_url, + "full_name": change.full_name, + "change_type": change.change_type, + "change_type_display": _change_type_display(change.change_type), + "profile_available": change.profile_available, + "profile_available_display": _profile_available_display(change.profile_available), + "message": change.message, + "created_at": change.created_at.isoformat() if change.created_at else None, + "created_display": format_admin_datetime(change.created_at), + } + + +def _crawl_error_payload(error: CrawlError) -> dict[str, Any]: + return { + "id": error.id, + "crawl_run_id": error.crawl_run_id, + "profile_url": error.profile_url, + "error_type": error.error_type, + "message": error.message, + "created_at": error.created_at.isoformat() if error.created_at else None, + "created_display": format_admin_datetime(error.created_at), + } + + +def _change_type_display(change_type: str | None) -> str: + labels = { + "new": "Новый", + "missing_from_source": "Потеряшка", + "dismissed": "Уволен", + } + return labels.get(change_type or "", change_type or "Не указано") + + +def _profile_available_display(value: bool | None) -> str: + if value is True: + return "Профиль доступен" + if value is False: + return "Профиль недоступен" + return "Не проверялось" + + def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int: total = 0 for section in sections: diff --git a/app/services/crawler.py b/app/services/crawler.py index 377a03e..840c5e7 100644 --- a/app/services/crawler.py +++ b/app/services/crawler.py @@ -9,7 +9,7 @@ from sqlalchemy import select from sqlalchemy.orm import Session from app.config import Settings -from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab +from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab from app.parser.collector import collect_profile_links from app.parser.profile import parse_person_profile from app.parser.profile_url import profile_key @@ -68,7 +68,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun: finally: time.sleep(settings.request_delay_seconds) - run.dismissed_count = _mark_dismissed(db, found_keys) + run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout) run.status = "completed" except Exception as exc: run.status = "failed" @@ -107,6 +107,9 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: ) db.add(employee) run.new_count += 1 + is_new = True + else: + is_new = False employee.full_name = parsed.get("full_name") employee.status = "active" @@ -117,6 +120,16 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: employee.current_checksum = checksum db.flush() + if is_new: + _record_employee_change( + db, + run, + employee, + "new", + profile_available=True, + message="Сотрудник впервые найден в источнике.", + ) + db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete() for tab in parsed.get("tabs") or []: db.add( @@ -141,20 +154,70 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: return employee -def _mark_dismissed(db: Session, found_keys: set[str]) -> int: +def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int: dismissed = 0 active = db.scalars(select(Employee).where(Employee.status == "active")).all() now = datetime.now(timezone.utc) for employee in active: if employee.profile_key in found_keys: continue + profile_available = _profile_is_available(session, employee.canonical_url, timeout) + if profile_available: + _record_employee_change( + db, + run, + employee, + "missing_from_source", + profile_available=True, + message="Профиль доступен, но ссылка отсутствует в исходном списке.", + ) + continue employee.status = "dismissed" employee.dismissed_at = now + _record_employee_change( + db, + run, + employee, + "dismissed", + profile_available=False, + message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.", + ) dismissed += 1 db.commit() return dismissed +def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool: + try: + response = session.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True) + return response.status_code < 400 + except requests.RequestException: + return False + + +def _record_employee_change( + db: Session, + run: CrawlRun, + employee: Employee, + change_type: str, + *, + profile_available: bool | None, + message: str, +) -> None: + db.add( + CrawlRunEmployeeChange( + crawl_run_id=run.id, + employee_id=employee.id, + profile_key=employee.profile_key, + profile_url=employee.canonical_url, + full_name=employee.full_name, + change_type=change_type, + profile_available=profile_available, + message=message, + ) + ) + + def _checksum(data: dict) -> str: payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":")) return hashlib.sha256(payload.encode("utf-8")).hexdigest() diff --git a/app/templates/run_detail.html b/app/templates/run_detail.html new file mode 100644 index 0000000..6b7d045 --- /dev/null +++ b/app/templates/run_detail.html @@ -0,0 +1,64 @@ +{% extends "base.html" %} +{% block title %}Запуск {{ run.id }} · MIEM Employees{% endblock %} +{% block content %} +
+
+
+

Запуск {{ run.id }}

+

{{ run.started_display }} · {{ run.status_display }}

+
+ Все запуски +
+
+
Найдено{{ run.found_count }}
+
Обработано{{ run.parsed_count }}
+
Новые{{ run.new_count }}
+
Потеряшки{{ run.changes.missing_from_source | length }}
+
Уволены{{ run.dismissed_count }}
+
Ошибки{{ run.error_count }}
+
+ {% if not run.changes_detail_available %} +

Детализация сотрудников для этого запуска недоступна. Она сохраняется только для новых запусков после обновления.

+ {% endif %} +
+ +{% for group, title in [("new", "Новые сотрудники"), ("missing_from_source", "Потеряшки"), ("dismissed", "Уволенные")] %} +
+

{{ title }}

+ {% set items = run.changes[group] %} + {% if items %} + + + + {% for item in items %} + + + + + + + {% endfor %} + +
ФИОПрофильПроверкаКомментарий
{% if item.employee_id %}{{ item.full_name or item.profile_key }}{% else %}{{ item.full_name or item.profile_key }}{% endif %}{{ item.profile_url }}{{ item.profile_available_display }}{{ item.message or "" }}
+ {% else %} +

Нет записей.

+ {% endif %} +
+{% endfor %} + +
+

Ошибки запуска

+ {% if run.errors %} + + + + {% for error in run.errors %} + + {% endfor %} + +
ПрофильОшибкаВремя
{{ error.profile_url or "" }}{{ error.error_type }}: {{ error.message }}{{ error.created_display }}
+ {% else %} +

Ошибок нет.

+ {% endif %} +
+{% endblock %} diff --git a/app/templates/runs.html b/app/templates/runs.html index bd86cd2..f79683d 100644 --- a/app/templates/runs.html +++ b/app/templates/runs.html @@ -38,7 +38,7 @@ IDСтатусНайденоОбработаноНовыеОшибкиУволеныСтарт {% for run in runs %} - {{ run.id }}{{ run.status_display }}{{ run.found_count }}{{ run.parsed_count }}{{ run.new_count }}{{ run.error_count }}{{ run.dismissed_count }}{{ run.started_display }} + {{ run.id }}{{ run.status_display }}{{ run.found_count }}{{ run.parsed_count }}{{ run.new_count }}{{ run.error_count }}{{ run.dismissed_count }}{{ run.started_display }} {% endfor %} diff --git a/app/version.py b/app/version.py index a9d2612..695034a 100644 --- a/app/version.py +++ b/app/version.py @@ -1,3 +1,3 @@ -APP_VERSION = "0.3.0" -FRONTEND_VERSION = "0.3.0" -BACKEND_VERSION = "0.3.0" +APP_VERSION = "0.4.0" +FRONTEND_VERSION = "0.4.0" +BACKEND_VERSION = "0.4.0" diff --git a/migrations/003_crawl_run_employee_changes.sql b/migrations/003_crawl_run_employee_changes.sql new file mode 100644 index 0000000..4d9e7f6 --- /dev/null +++ b/migrations/003_crawl_run_employee_changes.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS crawl_run_employee_changes ( + id SERIAL PRIMARY KEY, + crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id), + employee_id INTEGER REFERENCES employees(id), + profile_key VARCHAR(255) NOT NULL, + profile_url TEXT NOT NULL, + full_name TEXT, + change_type VARCHAR(32) NOT NULL, + profile_available BOOLEAN, + message TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_run_id + ON crawl_run_employee_changes (crawl_run_id); + +CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_employee_id + ON crawl_run_employee_changes (employee_id); + +CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_change_type + ON crawl_run_employee_changes (change_type); diff --git a/pyproject.toml b/pyproject.toml index 6c46484..5e158e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "miem-workers" -version = "0.3.0" +version = "0.4.0" description = "MIEM employees parser, admin API, and MCP server" requires-python = ">=3.11" dependencies = [ diff --git a/tests/test_admin_data.py b/tests/test_admin_data.py index 93f7fb2..6f636ad 100644 --- a/tests/test_admin_data.py +++ b/tests/test_admin_data.py @@ -1,11 +1,12 @@ from datetime import datetime, timezone -from app.models import CrawlRun, Employee +from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee from app.services.admin_data import ( employee_detail_payload, employee_display_payload, format_admin_datetime, list_employees_page, + run_detail_payload, run_payload, stats_payload, ) @@ -207,3 +208,43 @@ def test_run_payload_calculates_progress(): assert payload["processed_count"] == 5 assert payload["progress_percent"] == 50.0 assert payload["status_display"] == "Выполняется" + + +def test_run_detail_payload_groups_changes_and_handles_old_runs(db_session): + old_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed") + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1) + employee = Employee( + profile_key="staff:new", + canonical_url="https://www.hse.ru/staff/new", + full_name="New Person", + status="active", + first_seen_at=datetime.now(timezone.utc), + last_seen_at=datetime.now(timezone.utc), + ) + db_session.add_all([old_run, run, employee]) + db_session.commit() + db_session.add( + CrawlRunEmployeeChange( + crawl_run_id=run.id, + employee_id=employee.id, + profile_key=employee.profile_key, + profile_url=employee.canonical_url, + full_name=employee.full_name, + change_type="new", + profile_available=True, + message="added", + ) + ) + db_session.add( + CrawlError(crawl_run_id=run.id, profile_url=employee.canonical_url, error_type="ValueError", message="bad") + ) + db_session.commit() + + payload = run_detail_payload(db_session, run) + old_payload = run_detail_payload(db_session, old_run) + + assert payload["changes_detail_available"] is True + assert payload["changes"]["new"][0]["full_name"] == "New Person" + assert payload["errors"][0]["error_type"] == "ValueError" + assert old_payload["changes_detail_available"] is False + assert old_payload["changes"]["new"] == [] diff --git a/tests/test_admin_templates.py b/tests/test_admin_templates.py index f1b9fe5..b711282 100644 --- a/tests/test_admin_templates.py +++ b/tests/test_admin_templates.py @@ -32,3 +32,19 @@ def test_admin_employees_route_redirects_to_directory(): source = Path("app/admin.py").read_text(encoding="utf-8") assert 'RedirectResponse("/admin/directory", status_code=303)' in source + + +def test_runs_template_links_to_run_detail(): + template = Path("app/templates/runs.html").read_text(encoding="utf-8") + + assert 'href="/admin/runs/{{ run.id }}"' in template + + +def test_run_detail_template_extends_base_and_shows_change_groups(): + template = Path("app/templates/run_detail.html").read_text(encoding="utf-8") + + assert '{% extends "base.html" %}' in template + assert "Новые сотрудники" in template + assert "Потеряшки" in template + assert "Уволенные" in template + assert "Детализация сотрудников для этого запуска недоступна" in template diff --git a/tests/test_api_mcp.py b/tests/test_api_mcp.py index 5842d73..9bb867b 100644 --- a/tests/test_api_mcp.py +++ b/tests/test_api_mcp.py @@ -13,7 +13,7 @@ import app.security as security from app.config import Settings, get_settings from app.db import Base, get_db from app.main import app -from app.models import CrawlRun, Employee +from app.models import CrawlRun, CrawlRunEmployeeChange, Employee from app.security import SESSION_COOKIE, sign_session @@ -23,7 +23,7 @@ def test_health_returns_versions(): response = client.get("/api/health") assert response.status_code == 200 - assert response.json()["backend_version"] == "0.3.0" + assert response.json()["backend_version"] == "0.4.0" def test_mcp_requires_token_and_lists_tools(): @@ -58,6 +58,7 @@ def test_mcp_requires_token_and_lists_tools(): assert unauthorized.status_code == 401 assert authorized.status_code == 200 assert authorized.json()["result"]["tools"][0]["name"] == "search_employees" + assert any(tool["name"] == "get_crawl_run_details" for tool in authorized.json()["result"]["tools"]) app.dependency_overrides.clear() @@ -117,6 +118,76 @@ def test_mcp_search_employees_returns_matching_employee(): app.dependency_overrides.clear() +def test_mcp_get_crawl_run_details_returns_changes(): + engine = create_engine( + "sqlite:///:memory:", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + Base.metadata.create_all(engine) + Session = sessionmaker(bind=engine) + session = Session() + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1) + employee = Employee( + profile_key="staff:new", + profile_type="staff", + profile_id="new", + canonical_url="https://www.hse.ru/staff/new", + full_name="New Person", + status="active", + first_seen_at=datetime.now(timezone.utc), + last_seen_at=datetime.now(timezone.utc), + ) + session.add_all([run, employee]) + session.commit() + session.add( + CrawlRunEmployeeChange( + crawl_run_id=run.id, + employee_id=employee.id, + profile_key=employee.profile_key, + profile_url=employee.canonical_url, + full_name=employee.full_name, + change_type="new", + profile_available=True, + message="added", + ) + ) + session.commit() + run_id = run.id + session.close() + + def override_db(): + db = Session() + try: + yield db + finally: + db.close() + + app.dependency_overrides[get_db] = override_db + app.dependency_overrides[get_settings] = lambda: Settings( + mcp_auth_mode="token", mcp_token="secret", session_secret="session-secret" + ) + client = TestClient(app) + + response = client.post( + "/mcp", + headers={"Authorization": "Bearer secret"}, + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": {"name": "get_crawl_run_details", "arguments": {"run_id": run_id}}, + }, + ) + + assert response.status_code == 200 + text = response.json()["result"]["content"][0]["text"] + assert "New Person" in text + assert "changes_detail_available" in text + + app.dependency_overrides.clear() + + def test_mcp_oauth_rejects_static_token(): engine = create_engine( "sqlite:///:memory:", @@ -281,8 +352,23 @@ def test_api_employees_and_stats_require_admin_session(): current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []}, ) ) - db.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)) + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1) + db.add(run) db.commit() + db.add( + CrawlRunEmployeeChange( + crawl_run_id=run.id, + employee_id=1, + profile_key="staff:alpha", + profile_url="https://www.hse.ru/staff/alpha", + full_name="Alpha Person", + change_type="new", + profile_available=True, + message="added", + ) + ) + db.commit() + run_id = run.id db.close() settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret") @@ -301,11 +387,14 @@ def test_api_employees_and_stats_require_admin_session(): employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True}) stats = client.get("/api/stats") + run_details = client.get(f"/api/crawl-runs/{run_id}") assert employees.status_code == 200 assert employees.json()["total"] == 1 assert stats.status_code == 200 assert stats.json()["new_in_last_run"] == 1 + assert run_details.status_code == 200 + assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person" app.dependency_overrides.clear() diff --git a/tests/test_crawler.py b/tests/test_crawler.py index f742e6f..fa9b674 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,10 +1,25 @@ from datetime import datetime, timezone -from app.models import CrawlRun, Employee +from app.models import CrawlRun, CrawlRunEmployeeChange, Employee from app.services.crawler import _mark_dismissed, _upsert_employee -def test_mark_dismissed_only_marks_missing_active_employees(db_session): +class FakeResponse: + def __init__(self, status_code): + self.status_code = status_code + + +class FakeSession: + def __init__(self, statuses): + self.statuses = statuses + + def get(self, url, **_kwargs): + return FakeResponse(self.statuses[url]) + + +def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session): + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") + db_session.add(run) db_session.add( Employee( profile_key="staff:kept", @@ -16,8 +31,8 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session): ) db_session.add( Employee( - profile_key="staff:gone", - canonical_url="https://www.hse.ru/staff/gone", + profile_key="staff:missing", + canonical_url="https://www.hse.ru/staff/missing", status="active", first_seen_at=datetime.now(timezone.utc), last_seen_at=datetime.now(timezone.utc), @@ -25,16 +40,53 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session): ) db_session.commit() - dismissed = _mark_dismissed(db_session, {"staff:kept"}) + dismissed = _mark_dismissed( + db_session, + run, + {"staff:kept"}, + FakeSession({"https://www.hse.ru/staff/missing": 200}), + 30, + ) + + assert dismissed == 0 + assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active" + missing = db_session.query(Employee).filter_by(profile_key="staff:missing").one() + assert missing.status == "active" + assert missing.dismissed_at is None + change = db_session.query(CrawlRunEmployeeChange).one() + assert change.change_type == "missing_from_source" + assert change.profile_available is True + + +def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session): + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") + employee = Employee( + profile_key="staff:gone", + canonical_url="https://www.hse.ru/staff/gone", + status="active", + first_seen_at=datetime.now(timezone.utc), + last_seen_at=datetime.now(timezone.utc), + ) + db_session.add_all([run, employee]) + db_session.commit() + + dismissed = _mark_dismissed( + db_session, + run, + set(), + FakeSession({"https://www.hse.ru/staff/gone": 404}), + 30, + ) assert dismissed == 1 - assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active" - gone = db_session.query(Employee).filter_by(profile_key="staff:gone").one() - assert gone.status == "dismissed" - assert gone.dismissed_at is not None + assert employee.status == "dismissed" + assert employee.dismissed_at is not None + change = db_session.query(CrawlRunEmployeeChange).one() + assert change.change_type == "dismissed" + assert change.profile_available is False -def test_upsert_employee_increments_new_count_for_new_employee(db_session): +def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session): run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") db_session.add(run) db_session.commit() @@ -56,3 +108,6 @@ def test_upsert_employee_increments_new_count_for_new_employee(db_session): db_session.commit() assert run.new_count == 1 + change = db_session.query(CrawlRunEmployeeChange).one() + assert change.change_type == "new" + assert change.full_name == "New Person"