diff --git a/README.md b/README.md
index de0a8d9..d6b61c7 100644
--- a/README.md
+++ b/README.md
@@ -131,4 +131,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
docker compose down
```
-Версия сервиса: `0.3.0`. Админка всегда показывает версии backend и frontend в footer.
+Версия сервиса: `0.4.0`. Админка всегда показывает версии backend и frontend в footer.
diff --git a/app/admin.py b/app/admin.py
index 5fba191..0348ff4 100644
--- a/app/admin.py
+++ b/app/admin.py
@@ -8,7 +8,14 @@ from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
-from app.services.admin_data import employee_detail_payload, format_admin_datetime, list_employees_page, run_payload, stats_payload
+from app.services.admin_data import (
+ employee_detail_payload,
+ format_admin_datetime,
+ list_employees_page,
+ run_detail_payload,
+ run_payload,
+ stats_payload,
+)
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
@@ -150,6 +157,20 @@ def runs(request: Request, db: Session = Depends(get_db), settings: Settings = D
return _render(request, "runs.html", {"runs": items, "errors": errors})
+@router.get("/runs/{run_id}", response_class=HTMLResponse)
+def run_detail(
+    run_id: int,
+    request: Request,
+    db: Session = Depends(get_db),
+    settings: Settings = Depends(get_settings),
+):
+    # Admin page for one crawl run: its summary, grouped employee changes
+    # and crawl errors, all taken from run_detail_payload().
+    require_admin(request, settings)
+    crawl_run = db.get(CrawlRun, run_id)
+    if crawl_run:
+        context = {"run": run_detail_payload(db, crawl_run)}
+        return _render(request, "run_detail.html", context)
+    # Unknown run id: send the browser back to the run list.
+    return RedirectResponse("/admin/runs", status_code=303)
+
+
@router.post("/runs")
def trigger_run(
request: Request,
diff --git a/app/api.py b/app/api.py
index 12f3a78..37b65fa 100644
--- a/app/api.py
+++ b/app/api.py
@@ -8,7 +8,7 @@ from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlRun, Employee
from app.security import require_admin
-from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
+from app.services.admin_data import employee_display_payload, list_employees_page, run_detail_payload, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
@@ -88,6 +88,20 @@ def latest_crawl_run(
return {"running": run_payload(running), "latest": run_payload(latest)}
+@router.get("/crawl-runs/{run_id}")
+def get_crawl_run(
+    run_id: int,
+    request: Request,
+    db: Session = Depends(get_db),
+    settings: Settings = Depends(get_settings),
+) -> dict:
+    # Detailed JSON view of a single crawl run (admin session required).
+    # Returns the run_detail_payload() dict for an existing run. An unknown
+    # run id keeps the {"error": "not_found"} body but is now answered with
+    # HTTP 404 instead of 200, so clients can rely on the status code.
+    from fastapi.responses import JSONResponse
+
+    require_admin(request, settings)
+    run = db.get(CrawlRun, run_id)
+    if run is None:
+        return JSONResponse(status_code=404, content={"error": "not_found"})
+    # run_detail_payload() only returns None for a missing run, which was
+    # handled above, so no further fallback is needed here.
+    return run_detail_payload(db, run)
+
+
@router.post("/crawl-runs")
def trigger_crawl(
request: Request,
diff --git a/app/mcp.py b/app/mcp.py
index ed5a311..1725b35 100644
--- a/app/mcp.py
+++ b/app/mcp.py
@@ -8,6 +8,8 @@ from app.config import Settings, get_settings
from app.db import get_db
from app.models import CrawlRun, Employee
from app.security import mcp_protected_resource_metadata, require_mcp_auth
+from app.services.admin_data import run_detail_payload
+from app.version import BACKEND_VERSION
router = APIRouter(prefix="/mcp")
metadata_router = APIRouter()
@@ -47,6 +49,15 @@ TOOLS = [
"description": "Return the latest crawl run status.",
"inputSchema": {"type": "object", "properties": {}},
},
+ {
+ "name": "get_crawl_run_details",
+ "description": "Return detailed employee changes and errors for one crawl run.",
+ "inputSchema": {
+ "type": "object",
+ "properties": {"run_id": {"type": "integer"}},
+ "required": ["run_id"],
+ },
+ },
]
@@ -66,7 +77,7 @@ async def mcp_http(
if method == "initialize":
result = {
"protocolVersion": "2024-11-05",
- "serverInfo": {"name": "miem-employees", "version": "0.1.0"},
+ "serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
"capabilities": {"tools": {}},
}
elif method == "tools/list":
@@ -95,6 +106,9 @@ def _call_tool(db: Session, name: str, arguments: dict) -> dict:
if name == "get_crawl_status":
run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
return _tool_response(_run_payload(run) if run else {"status": "never_run"})
+ if name == "get_crawl_run_details":
+ run = db.get(CrawlRun, int(arguments["run_id"]))
+ return _tool_response(run_detail_payload(db, run) if run else {"error": "not_found"})
raise ValueError(f"Unknown tool: {name}")
diff --git a/app/models.py b/app/models.py
index dd84c56..eeba774 100644
--- a/app/models.py
+++ b/app/models.py
@@ -41,6 +41,7 @@ class Employee(Base):
snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
+ crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee")
class EmployeeSnapshot(Base):
@@ -74,6 +75,31 @@ class CrawlRun(Base):
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
message: Mapped[str | None] = mapped_column(Text)
+ employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
+
+
+class CrawlRunEmployeeChange(Base):
+ """One employee-level change recorded while a crawl run was executing.
+
+ The crawler writes rows with ``change_type`` set to ``"new"``,
+ ``"missing_from_source"`` or ``"dismissed"``.
+ """
+
+ __tablename__ = "crawl_run_employee_changes"
+ # Single-column indexes on the run, the employee and the change type.
+ __table_args__ = (
+ Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"),
+ Index("ix_crawl_run_employee_changes_employee_id", "employee_id"),
+ Index("ix_crawl_run_employee_changes_change_type", "change_type"),
+ )
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True)
+ # Run during which the change was detected (required).
+ crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
+ # Link to the employee row; the column is nullable.
+ employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
+ # profile_key, profile_url and full_name are copied from the employee by
+ # the crawler at the moment the change is recorded.
+ profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
+ profile_url: Mapped[str] = mapped_column(Text, nullable=False)
+ full_name: Mapped[str | None] = mapped_column(Text)
+ change_type: Mapped[str] = mapped_column(String(32), nullable=False)
+ # Availability of the profile page as seen by the crawler; nullable, and
+ # the admin payload renders None as "not checked".
+ profile_available: Mapped[bool | None] = mapped_column()
+ # Free-text explanation supplied by the crawler.
+ message: Mapped[str | None] = mapped_column(Text)
+ created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
+
+ crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes")
+ employee: Mapped[Employee | None] = relationship(back_populates="crawl_run_changes")
+
class CrawlError(Base):
__tablename__ = "crawl_errors"
diff --git a/app/services/admin_data.py b/app/services/admin_data.py
index 09fc9de..dc9fa82 100644
--- a/app/services/admin_data.py
+++ b/app/services/admin_data.py
@@ -8,7 +8,7 @@ from zoneinfo import ZoneInfo
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
-from app.models import CrawlRun, Employee
+from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
EMPLOYEE_SORTS = {
"full_name": Employee.full_name,
@@ -175,6 +175,26 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
}
+def run_detail_payload(db: Session, run: CrawlRun | None) -> dict[str, Any] | None:
+    """Build the detailed payload for one crawl run.
+
+    The result is ``run_payload(run)`` extended with the run's employee
+    changes grouped by change type and the list of its crawl errors.
+
+    Args:
+        db: Session used to load the change and error rows.
+        run: Crawl run to describe.
+
+    Returns:
+        The payload dict, or ``None`` when ``run`` is ``None``.
+    """
+    if run is None:
+        return None
+    changes = db.scalars(
+        select(CrawlRunEmployeeChange)
+        .where(CrawlRunEmployeeChange.crawl_run_id == run.id)
+        .order_by(CrawlRunEmployeeChange.created_at, CrawlRunEmployeeChange.id)
+    ).all()
+    # Tie-break on id, exactly as for changes, so errors that share a
+    # timestamp are always returned in the same order.
+    errors = db.scalars(
+        select(CrawlError)
+        .where(CrawlError.crawl_run_id == run.id)
+        .order_by(CrawlError.created_at, CrawlError.id)
+    ).all()
+    # The known groups are pre-seeded because the run detail template indexes
+    # them directly, even when they are empty; any other change type gets its
+    # own group on first use.
+    grouped_changes: dict[str, list[dict[str, Any]]] = {
+        "new": [],
+        "missing_from_source": [],
+        "dismissed": [],
+    }
+    for change in changes:
+        grouped_changes.setdefault(change.change_type, []).append(_change_payload(change))
+    return {
+        **(run_payload(run) or {}),
+        # NOTE(review): this is False both for runs that predate change
+        # tracking and for runs that simply recorded no changes.
+        "changes_detail_available": bool(changes),
+        "changes": grouped_changes,
+        "errors": [_crawl_error_payload(error) for error in errors],
+    }
+
+
def format_admin_datetime(value: Any) -> str:
if not value:
return "Не указано"
@@ -200,6 +220,52 @@ def _run_status_display(status: str | None) -> str:
return labels.get(status or "", status or "Не указано")
+def _change_payload(change: CrawlRunEmployeeChange) -> dict[str, Any]:
+    """Serialize one employee change row, adding display strings for the UI."""
+    # Columns that are copied into the payload unchanged, in output order.
+    payload: dict[str, Any] = {
+        column: getattr(change, column)
+        for column in ("id", "employee_id", "profile_key", "profile_url", "full_name", "change_type")
+    }
+    payload["change_type_display"] = _change_type_display(payload["change_type"])
+    payload["profile_available"] = change.profile_available
+    payload["profile_available_display"] = _profile_available_display(payload["profile_available"])
+    payload["message"] = change.message
+    recorded_at = change.created_at
+    payload["created_at"] = recorded_at.isoformat() if recorded_at else None
+    payload["created_display"] = format_admin_datetime(recorded_at)
+    return payload
+
+
+def _crawl_error_payload(error: CrawlError) -> dict[str, Any]:
+    """Serialize one crawl error row, adding ISO and display timestamps."""
+    # Columns that are copied into the payload unchanged, in output order.
+    payload: dict[str, Any] = {
+        column: getattr(error, column)
+        for column in ("id", "crawl_run_id", "profile_url", "error_type", "message")
+    }
+    happened_at = error.created_at
+    payload["created_at"] = happened_at.isoformat() if happened_at else None
+    payload["created_display"] = format_admin_datetime(happened_at)
+    return payload
+
+
+def _change_type_display(change_type: str | None) -> str:
+    """Return the admin-facing label for a change type.
+
+    Known types map to their label, an unknown non-empty type is returned
+    as-is, and a missing or empty type yields the "not specified" label.
+    """
+    if not change_type:
+        return "Не указано"
+    known_labels = {
+        "new": "Новый",
+        "missing_from_source": "Потеряшка",
+        "dismissed": "Уволен",
+    }
+    return known_labels.get(change_type, change_type)
+
+
+def _profile_available_display(value: bool | None) -> str:
+    """Return the admin-facing label for a profile availability flag.
+
+    Only the ``True`` and ``False`` singletons are treated as a check result;
+    every other value is reported as "not checked".
+    """
+    if value is not True and value is not False:
+        return "Не проверялось"
+    return "Профиль доступен" if value else "Профиль недоступен"
+
+
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:
diff --git a/app/services/crawler.py b/app/services/crawler.py
index 377a03e..840c5e7 100644
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -9,7 +9,7 @@ from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import Settings
-from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
+from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
@@ -68,7 +68,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
finally:
time.sleep(settings.request_delay_seconds)
- run.dismissed_count = _mark_dismissed(db, found_keys)
+ run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed"
except Exception as exc:
run.status = "failed"
@@ -107,6 +107,9 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
)
db.add(employee)
run.new_count += 1
+ is_new = True
+ else:
+ is_new = False
employee.full_name = parsed.get("full_name")
employee.status = "active"
@@ -117,6 +120,16 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
employee.current_checksum = checksum
db.flush()
+ if is_new:
+ _record_employee_change(
+ db,
+ run,
+ employee,
+ "new",
+ profile_available=True,
+ message="Сотрудник впервые найден в источнике.",
+ )
+
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
db.add(
@@ -141,20 +154,70 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
return employee
-def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
+def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
+ """Reconcile active employees that were not found in this crawl.
+
+ Every active employee whose ``profile_key`` is not in ``found_keys`` has
+ its profile URL probed. If the profile answers, a ``missing_from_source``
+ change is recorded and the employee stays active. Otherwise the employee
+ is marked dismissed and a ``dismissed`` change is recorded. The session
+ is committed before returning.
+
+ Returns:
+ Number of employees marked dismissed.
+ """
dismissed = 0
active = db.scalars(select(Employee).where(Employee.status == "active")).all()
now = datetime.now(timezone.utc)
for employee in active:
if employee.profile_key in found_keys:
continue
+ # NOTE(review): _profile_is_available() returns False for any
+ # requests.RequestException, so a transient network failure here ends
+ # in a dismissal - confirm that this is intended.
+ profile_available = _profile_is_available(session, employee.canonical_url, timeout)
+ if profile_available:
+ # Profile still answers: record the discrepancy, keep the employee active.
+ _record_employee_change(
+ db,
+ run,
+ employee,
+ "missing_from_source",
+ profile_available=True,
+ message="Профиль доступен, но ссылка отсутствует в исходном списке.",
+ )
+ continue
employee.status = "dismissed"
employee.dismissed_at = now
+ _record_employee_change(
+ db,
+ run,
+ employee,
+ "dismissed",
+ profile_available=False,
+ message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.",
+ )
dismissed += 1
db.commit()
return dismissed
+def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool:
+    """Probe *url* and report whether it answered with a non-error status.
+
+    Redirects are followed. A status code of 400 or above, or any
+    ``requests.RequestException`` raised by the request, counts as
+    unavailable.
+    """
+    try:
+        reply = session.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
+    except requests.RequestException:
+        return False
+    return reply.status_code < 400
+
+
+def _record_employee_change(
+    db: Session,
+    run: CrawlRun,
+    employee: Employee,
+    change_type: str,
+    *,
+    profile_available: bool | None,
+    message: str,
+) -> None:
+    """Add a change row for *employee* within *run* to the session.
+
+    The employee's profile key, canonical URL and full name are copied as
+    they are at call time. The row is only added to the session; nothing is
+    flushed or committed here.
+    """
+    # Identity fields copied from the employee as it is right now.
+    employee_snapshot = {
+        "employee_id": employee.id,
+        "profile_key": employee.profile_key,
+        "profile_url": employee.canonical_url,
+        "full_name": employee.full_name,
+    }
+    entry = CrawlRunEmployeeChange(
+        crawl_run_id=run.id,
+        change_type=change_type,
+        profile_available=profile_available,
+        message=message,
+        **employee_snapshot,
+    )
+    db.add(entry)
+
+
def _checksum(data: dict) -> str:
payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
diff --git a/app/templates/run_detail.html b/app/templates/run_detail.html
new file mode 100644
index 0000000..6b7d045
--- /dev/null
+++ b/app/templates/run_detail.html
@@ -0,0 +1,64 @@
+{% extends "base.html" %}
+{% block title %}Запуск {{ run.id }} · MIEM Employees{% endblock %}
+{% block content %}
+
+
+
+
Найдено{{ run.found_count }}
+
Обработано{{ run.parsed_count }}
+
Новые{{ run.new_count }}
+
Потеряшки{{ run.changes.missing_from_source | length }}
+
Уволены{{ run.dismissed_count }}
+
Ошибки{{ run.error_count }}
+
+ {% if not run.changes_detail_available %}
+ Детализация сотрудников для этого запуска недоступна. Она сохраняется только для новых запусков после обновления.
+ {% endif %}
+
+
+{% for group, title in [("new", "Новые сотрудники"), ("missing_from_source", "Потеряшки"), ("dismissed", "Уволенные")] %}
+
+ {{ title }}
+ {% set items = run.changes[group] %}
+ {% if items %}
+
+ {% else %}
+ Нет записей.
+ {% endif %}
+
+{% endfor %}
+
+
+ Ошибки запуска
+ {% if run.errors %}
+
+ | Профиль | Ошибка | Время |
+
+ {% for error in run.errors %}
+ | {{ error.profile_url or "" }} | {{ error.error_type }}: {{ error.message }} | {{ error.created_display }} |
+ {% endfor %}
+
+
+ {% else %}
+ Ошибок нет.
+ {% endif %}
+
+{% endblock %}
diff --git a/app/templates/runs.html b/app/templates/runs.html
index bd86cd2..f79683d 100644
--- a/app/templates/runs.html
+++ b/app/templates/runs.html
@@ -38,7 +38,7 @@
| ID | Статус | Найдено | Обработано | Новые | Ошибки | Уволены | Старт |
{% for run in runs %}
- | {{ run.id }} | {{ run.status_display }} | {{ run.found_count }} | {{ run.parsed_count }} | {{ run.new_count }} | {{ run.error_count }} | {{ run.dismissed_count }} | {{ run.started_display }} |
+ | {{ run.id }} | {{ run.status_display }} | {{ run.found_count }} | {{ run.parsed_count }} | {{ run.new_count }} | {{ run.error_count }} | {{ run.dismissed_count }} | {{ run.started_display }} |
{% endfor %}
diff --git a/app/version.py b/app/version.py
index a9d2612..695034a 100644
--- a/app/version.py
+++ b/app/version.py
@@ -1,3 +1,3 @@
-APP_VERSION = "0.3.0"
-FRONTEND_VERSION = "0.3.0"
-BACKEND_VERSION = "0.3.0"
+# Bumped together with `version` in pyproject.toml and the version quoted in
+# README.md; the health endpoint test asserts the backend value.
+APP_VERSION = "0.4.0"
+FRONTEND_VERSION = "0.4.0"
+BACKEND_VERSION = "0.4.0"
diff --git a/migrations/003_crawl_run_employee_changes.sql b/migrations/003_crawl_run_employee_changes.sql
new file mode 100644
index 0000000..4d9e7f6
--- /dev/null
+++ b/migrations/003_crawl_run_employee_changes.sql
@@ -0,0 +1,21 @@
+-- Employee-level changes recorded per crawl run; the columns match the
+-- CrawlRunEmployeeChange model in app/models.py.
+CREATE TABLE IF NOT EXISTS crawl_run_employee_changes (
+ id SERIAL PRIMARY KEY,
+ crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id),
+ -- Nullable reference to the employee row.
+ employee_id INTEGER REFERENCES employees(id),
+ profile_key VARCHAR(255) NOT NULL,
+ profile_url TEXT NOT NULL,
+ full_name TEXT,
+ -- Values written by the crawler: 'new', 'missing_from_source', 'dismissed'.
+ change_type VARCHAR(32) NOT NULL,
+ profile_available BOOLEAN,
+ message TEXT,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Single-column indexes on the run, the employee and the change type.
+CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_run_id
+ ON crawl_run_employee_changes (crawl_run_id);
+
+CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_employee_id
+ ON crawl_run_employee_changes (employee_id);
+
+CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_change_type
+ ON crawl_run_employee_changes (change_type);
diff --git a/pyproject.toml b/pyproject.toml
index 6c46484..5e158e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "miem-workers"
-version = "0.3.0"
+version = "0.4.0"
description = "MIEM employees parser, admin API, and MCP server"
requires-python = ">=3.11"
dependencies = [
diff --git a/tests/test_admin_data.py b/tests/test_admin_data.py
index 93f7fb2..6f636ad 100644
--- a/tests/test_admin_data.py
+++ b/tests/test_admin_data.py
@@ -1,11 +1,12 @@
from datetime import datetime, timezone
-from app.models import CrawlRun, Employee
+from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
format_admin_datetime,
list_employees_page,
+ run_detail_payload,
run_payload,
stats_payload,
)
@@ -207,3 +208,43 @@ def test_run_payload_calculates_progress():
assert payload["processed_count"] == 5
assert payload["progress_percent"] == 50.0
assert payload["status_display"] == "Выполняется"
+
+
+def test_run_detail_payload_groups_changes_and_handles_old_runs(db_session):
+ # Covers two runs: one with a recorded change and an error, and one with
+ # no change rows at all.
+ # old_run never gets any change rows attached.
+ old_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed")
+ run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
+ employee = Employee(
+ profile_key="staff:new",
+ canonical_url="https://www.hse.ru/staff/new",
+ full_name="New Person",
+ status="active",
+ first_seen_at=datetime.now(timezone.utc),
+ last_seen_at=datetime.now(timezone.utc),
+ )
+ db_session.add_all([old_run, run, employee])
+ # Commit first so run.id and employee.id are populated before use below.
+ db_session.commit()
+ db_session.add(
+ CrawlRunEmployeeChange(
+ crawl_run_id=run.id,
+ employee_id=employee.id,
+ profile_key=employee.profile_key,
+ profile_url=employee.canonical_url,
+ full_name=employee.full_name,
+ change_type="new",
+ profile_available=True,
+ message="added",
+ )
+ )
+ db_session.add(
+ CrawlError(crawl_run_id=run.id, profile_url=employee.canonical_url, error_type="ValueError", message="bad")
+ )
+ db_session.commit()
+
+ payload = run_detail_payload(db_session, run)
+ old_payload = run_detail_payload(db_session, old_run)
+
+ # The run with data exposes its change under the "new" group and its error.
+ assert payload["changes_detail_available"] is True
+ assert payload["changes"]["new"][0]["full_name"] == "New Person"
+ assert payload["errors"][0]["error_type"] == "ValueError"
+ # The run without change rows reports no detail but still has the empty group.
+ assert old_payload["changes_detail_available"] is False
+ assert old_payload["changes"]["new"] == []
diff --git a/tests/test_admin_templates.py b/tests/test_admin_templates.py
index f1b9fe5..b711282 100644
--- a/tests/test_admin_templates.py
+++ b/tests/test_admin_templates.py
@@ -32,3 +32,19 @@ def test_admin_employees_route_redirects_to_directory():
source = Path("app/admin.py").read_text(encoding="utf-8")
assert 'RedirectResponse("/admin/directory", status_code=303)' in source
+
+
+def test_runs_template_links_to_run_detail():
+    """The run list template links every run to its detail page."""
+    markup = Path("app/templates/runs.html").read_text(encoding="utf-8")
+    detail_link = 'href="/admin/runs/{{ run.id }}"'
+
+    assert detail_link in markup
+
+
+def test_run_detail_template_extends_base_and_shows_change_groups():
+    """The run detail template extends the base layout and names each section."""
+    markup = Path("app/templates/run_detail.html").read_text(encoding="utf-8")
+    expected_fragments = (
+        '{% extends "base.html" %}',
+        "Новые сотрудники",
+        "Потеряшки",
+        "Уволенные",
+        "Детализация сотрудников для этого запуска недоступна",
+    )
+
+    for fragment in expected_fragments:
+        assert fragment in markup
diff --git a/tests/test_api_mcp.py b/tests/test_api_mcp.py
index 5842d73..9bb867b 100644
--- a/tests/test_api_mcp.py
+++ b/tests/test_api_mcp.py
@@ -13,7 +13,7 @@ import app.security as security
from app.config import Settings, get_settings
from app.db import Base, get_db
from app.main import app
-from app.models import CrawlRun, Employee
+from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.security import SESSION_COOKIE, sign_session
@@ -23,7 +23,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
- assert response.json()["backend_version"] == "0.3.0"
+ assert response.json()["backend_version"] == "0.4.0"
def test_mcp_requires_token_and_lists_tools():
@@ -58,6 +58,7 @@ def test_mcp_requires_token_and_lists_tools():
assert unauthorized.status_code == 401
assert authorized.status_code == 200
assert authorized.json()["result"]["tools"][0]["name"] == "search_employees"
+ assert any(tool["name"] == "get_crawl_run_details" for tool in authorized.json()["result"]["tools"])
app.dependency_overrides.clear()
@@ -117,6 +118,76 @@ def test_mcp_search_employees_returns_matching_employee():
app.dependency_overrides.clear()
+def test_mcp_get_crawl_run_details_returns_changes():
+    """The ``get_crawl_run_details`` MCP tool returns a run's recorded changes."""
+    engine = create_engine(
+        "sqlite:///:memory:",
+        connect_args={"check_same_thread": False},
+        poolclass=StaticPool,
+    )
+    Base.metadata.create_all(engine)
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
+    employee = Employee(
+        profile_key="staff:new",
+        profile_type="staff",
+        profile_id="new",
+        canonical_url="https://www.hse.ru/staff/new",
+        full_name="New Person",
+        status="active",
+        first_seen_at=datetime.now(timezone.utc),
+        last_seen_at=datetime.now(timezone.utc),
+    )
+    session.add_all([run, employee])
+    # Commit first so run.id and employee.id are populated before use below.
+    session.commit()
+    session.add(
+        CrawlRunEmployeeChange(
+            crawl_run_id=run.id,
+            employee_id=employee.id,
+            profile_key=employee.profile_key,
+            profile_url=employee.canonical_url,
+            full_name=employee.full_name,
+            change_type="new",
+            profile_available=True,
+            message="added",
+        )
+    )
+    session.commit()
+    # Read the id before closing the session that loaded the run.
+    run_id = run.id
+    session.close()
+
+    def override_db():
+        db = Session()
+        try:
+            yield db
+        finally:
+            db.close()
+
+    app.dependency_overrides[get_db] = override_db
+    app.dependency_overrides[get_settings] = lambda: Settings(
+        mcp_auth_mode="token", mcp_token="secret", session_secret="session-secret"
+    )
+    try:
+        client = TestClient(app)
+
+        response = client.post(
+            "/mcp",
+            headers={"Authorization": "Bearer secret"},
+            json={
+                "jsonrpc": "2.0",
+                "id": 1,
+                "method": "tools/call",
+                "params": {"name": "get_crawl_run_details", "arguments": {"run_id": run_id}},
+            },
+        )
+
+        assert response.status_code == 200
+        text = response.json()["result"]["content"][0]["text"]
+        assert "New Person" in text
+        assert "changes_detail_available" in text
+    finally:
+        # Restore the app's dependencies even when an assertion fails, so the
+        # overrides cannot leak into tests that run afterwards.
+        app.dependency_overrides.clear()
+
+
def test_mcp_oauth_rejects_static_token():
engine = create_engine(
"sqlite:///:memory:",
@@ -281,8 +352,23 @@ def test_api_employees_and_stats_require_admin_session():
current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []},
)
)
- db.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1))
+ run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
+ db.add(run)
db.commit()
+ db.add(
+ CrawlRunEmployeeChange(
+ crawl_run_id=run.id,
+ employee_id=1,
+ profile_key="staff:alpha",
+ profile_url="https://www.hse.ru/staff/alpha",
+ full_name="Alpha Person",
+ change_type="new",
+ profile_available=True,
+ message="added",
+ )
+ )
+ db.commit()
+ run_id = run.id
db.close()
settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")
@@ -301,11 +387,14 @@ def test_api_employees_and_stats_require_admin_session():
employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True})
stats = client.get("/api/stats")
+ run_details = client.get(f"/api/crawl-runs/{run_id}")
assert employees.status_code == 200
assert employees.json()["total"] == 1
assert stats.status_code == 200
assert stats.json()["new_in_last_run"] == 1
+ assert run_details.status_code == 200
+ assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person"
app.dependency_overrides.clear()
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index f742e6f..fa9b674 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -1,10 +1,25 @@
from datetime import datetime, timezone
-from app.models import CrawlRun, Employee
+from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.crawler import _mark_dismissed, _upsert_employee
-def test_mark_dismissed_only_marks_missing_active_employees(db_session):
+class FakeResponse:
+    """Response stub that carries nothing but a ``status_code`` attribute."""
+
+    def __init__(self, status_code):
+        # The only attribute this stub provides.
+        self.status_code = status_code
+
+
+class FakeSession:
+    """Session stub whose ``get`` answers from a URL-to-status lookup."""
+
+    def __init__(self, statuses):
+        self.statuses = statuses
+
+    def get(self, url, **_kwargs):
+        # Keyword arguments (headers, timeout, ...) are accepted and ignored.
+        # The status is looked up by indexing ``statuses`` with the URL.
+        status = self.statuses[url]
+        return FakeResponse(status)
+
+
+def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
+ run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
+ db_session.add(run)
db_session.add(
Employee(
profile_key="staff:kept",
@@ -16,8 +31,8 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.add(
Employee(
- profile_key="staff:gone",
- canonical_url="https://www.hse.ru/staff/gone",
+ profile_key="staff:missing",
+ canonical_url="https://www.hse.ru/staff/missing",
status="active",
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
@@ -25,16 +40,53 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.commit()
- dismissed = _mark_dismissed(db_session, {"staff:kept"})
+ dismissed = _mark_dismissed(
+ db_session,
+ run,
+ {"staff:kept"},
+ FakeSession({"https://www.hse.ru/staff/missing": 200}),
+ 30,
+ )
+
+ assert dismissed == 0
+ assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"
+ missing = db_session.query(Employee).filter_by(profile_key="staff:missing").one()
+ assert missing.status == "active"
+ assert missing.dismissed_at is None
+ change = db_session.query(CrawlRunEmployeeChange).one()
+ assert change.change_type == "missing_from_source"
+ assert change.profile_available is True
+
+
+def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session):
+ run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
+ employee = Employee(
+ profile_key="staff:gone",
+ canonical_url="https://www.hse.ru/staff/gone",
+ status="active",
+ first_seen_at=datetime.now(timezone.utc),
+ last_seen_at=datetime.now(timezone.utc),
+ )
+ db_session.add_all([run, employee])
+ db_session.commit()
+
+ dismissed = _mark_dismissed(
+ db_session,
+ run,
+ set(),
+ FakeSession({"https://www.hse.ru/staff/gone": 404}),
+ 30,
+ )
assert dismissed == 1
- assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"
- gone = db_session.query(Employee).filter_by(profile_key="staff:gone").one()
- assert gone.status == "dismissed"
- assert gone.dismissed_at is not None
+ assert employee.status == "dismissed"
+ assert employee.dismissed_at is not None
+ change = db_session.query(CrawlRunEmployeeChange).one()
+ assert change.change_type == "dismissed"
+ assert change.profile_available is False
-def test_upsert_employee_increments_new_count_for_new_employee(db_session):
+def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session):
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
db_session.add(run)
db_session.commit()
@@ -56,3 +108,6 @@ def test_upsert_employee_increments_new_count_for_new_employee(db_session):
db_session.commit()
assert run.new_count == 1
+ change = db_session.query(CrawlRunEmployeeChange).one()
+ assert change.change_type == "new"
+ assert change.full_name == "New Person"