feature: improve admin directory and crawl progress

Merge pull request 'feature: add MIEM employees parser service with admin UI and MCP' (#1) from feature/miem-employees-server into main
Reviewed-on: #1
2026-04-28 17:24:10 +03:00 · 2026-04-28 13:21:21 +00:00 · 2026-04-28 16:20:51 +03:00
36 changed files with 2909 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -41,6 +41,13 @@ uvicorn app.main:app --reload
 Админка: `http://localhost:8000/admin`.
 В админке доступны:
 - `Dashboard`: общая статистика, последний добавленный сотрудник, прогресс текущего/последнего парсинга и ручной запуск.
 - `Directory`: настраиваемая таблица сотрудников с фильтрами, сортировкой, пагинацией и выбором колонок.
 - `Employees`: простая legacy-таблица сотрудников.
 - `Runs`: история запусков, ошибки и progress bar.
 ## Docker Compose
 ```bash
@@ -57,7 +64,7 @@ docker compose up --build
 ## Парсинг
-Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на странице `Runs` или через REST:
+Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на страницах `Dashboard` и `Runs` или через REST:
 ```bash
 curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..."
@@ -67,9 +74,12 @@ curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=.
 - найденные сотрудники получают статус `active` и обновленный `last_seen_at`;
 - новые сотрудники добавляются в `employees`;
 - количество новых сотрудников за запуск сохраняется в `crawl_runs.new_count`;
 - активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`;
 - каждый успешный разбор сохраняет запись в `employee_snapshots`.
 Во время выполнения парсинга `found_count`, `parsed_count` и `error_count` обновляются в базе. Админка опрашивает `/api/crawl-runs/latest` и показывает прогресс как `(parsed_count + error_count) / found_count`.
 ## MCP
 Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`.
@@ -100,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
 docker compose down
 ```
-Версия сервиса: `0.1.0`. Админка всегда показывает версии backend и frontend в footer.
+Версия сервиса: `0.2.0`. Админка всегда показывает версии backend и frontend в footer.
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -0,0 +1 @@
 """MIEM employees service."""
--- a/app/admin.py
+++ b/app/admin.py
@@ -0,0 +1,204 @@
 from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request
 from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.templating import Jinja2Templates
 from sqlalchemy import desc, func, or_, select
 from sqlalchemy.orm import Session
 from app.config import Settings, get_settings
 from app.db import SessionLocal, get_db
 from app.models import CrawlError, CrawlRun, Employee
 from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
 from app.services.admin_data import list_employees_page, run_payload, stats_payload
 from app.services.crawl_control import get_running_run, run_crawl_if_idle
 from app.version import BACKEND_VERSION, FRONTEND_VERSION
 # Admin UI router: every route declared in this module is served under the /admin prefix.
 router = APIRouter(prefix="/admin")
 # Jinja2 template loader; the directory is given as a relative path ("app/templates").
 templates = Jinja2Templates(directory="app/templates")
@router.get("", response_class=HTMLResponse)
 def dashboard(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
     """Render the admin dashboard: aggregate counters plus the ten most recent crawl runs."""
     require_admin(request, settings)
     counts = stats_payload(db)
     total_runs = db.scalar(select(func.count()).select_from(CrawlRun))
     total_errors = db.scalar(select(func.count()).select_from(CrawlError))
     # A missing scalar result is reported as zero.
     counts["runs"] = total_runs or 0
     counts["errors"] = total_errors or 0
     recent_runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
     # The newest run (if any) feeds the progress widget.
     newest = run_payload(recent_runs[0]) if recent_runs else None
     context = {"counts": counts, "runs": recent_runs, "latest_run": newest}
     return _render(request, "dashboard.html", context)
@router.get("/login", response_class=HTMLResponse)
 def login_form(request: Request):
     """Render the login page without an error message."""
     context = {"error": None}
     return _render(request, "login.html", context)
@router.post("/login")
 def login(
     request: Request,
     username: str = Form(...),
     password: str = Form(...),
     settings: Settings = Depends(get_settings),
 ):
     """Check the submitted credentials and open an admin session.
     On success: 303 redirect to /admin with a signed, HTTP-only session cookie.
     On failure: the login page is re-rendered with an error and status 401.
     """
     if verify_admin(username, password, settings):
         response = RedirectResponse("/admin", status_code=303)
         token = sign_session(username, settings)
         response.set_cookie(SESSION_COOKIE, token, httponly=True, samesite="lax")
         return response
     failure = {"error": "Неверный логин или пароль"}
     return _render(request, "login.html", failure, status_code=401)
@router.post("/logout")
 def logout():
     """Delete the session cookie and send the browser back to the login page."""
     response = RedirectResponse("/admin/login", status_code=303)
     response.delete_cookie(SESSION_COOKIE)
     return response
@router.get("/employees", response_class=HTMLResponse)
 def employees(
     request: Request,
     status: str | None = None,
     q: str | None = None,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ):
     """Render the simple employees table (at most 200 rows, ordered by full name).
     ``status`` filters by exact status; ``q`` is a case-insensitive substring
     matched against the full name or the canonical URL.
     """
     require_admin(request, settings)
     conditions = []
     if status:
         conditions.append(Employee.status == status)
     if q:
         like = f"%{q}%"
         conditions.append(or_(Employee.full_name.ilike(like), Employee.canonical_url.ilike(like)))
     query = select(Employee)
     for condition in conditions:
         query = query.where(condition)
     rows = db.scalars(query.order_by(Employee.full_name).limit(200)).all()
     context = {"employees": rows, "status": status or "", "q": q or ""}
     return _render(request, "employees.html", context)
@router.get("/directory", response_class=HTMLResponse)
 def directory(
     request: Request,
     status: str | None = None,
     q: str | None = None,
     started_from: str | None = None,
     started_to: str | None = None,
     has_email: str | None = None,
     sort: str = "full_name",
     direction: str = "asc",
     limit: int = 50,
     offset: int = 0,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ):
     """Render the configurable employee directory.
     Raw query-string values are normalised, passed to ``list_employees_page``,
     and echoed back to the template under ``filters`` so the form keeps its state.
     """
     require_admin(request, settings)
     date_from = _parse_date(started_from)
     date_to = _parse_date(started_to)
     # Missing or empty means "no filter"; "true" becomes True, any other non-empty value False.
     email_flag = (has_email == "true") if has_email else None
     page = list_employees_page(
         db,
         status=status,
         q=q,
         started_from=date_from,
         started_to=date_to,
         has_email=email_flag,
         sort=sort,
         direction=direction,
         limit=limit,
         offset=offset,
     )
     # The template receives the raw strings, not the parsed values.
     filters = dict(
         status=status or "",
         q=q or "",
         started_from=started_from or "",
         started_to=started_to or "",
         has_email=has_email or "",
         sort=sort,
         direction=direction,
         limit=limit,
         offset=offset,
     )
     return _render(request, "directory.html", {"page": page, "filters": filters})
@router.get("/employees/{employee_id}", response_class=HTMLResponse)
 def employee_detail(
     employee_id: int,
     request: Request,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ):
     """Render one employee with up to 20 of their newest snapshots; unknown ids redirect to the list."""
     require_admin(request, settings)
     employee = db.get(Employee, employee_id)
     if not employee:
         return RedirectResponse("/admin/employees", status_code=303)
     # Newest first, sorted in Python over the loaded relationship.
     history = list(employee.snapshots)
     history.sort(key=lambda snap: snap.captured_at, reverse=True)
     context = {"employee": employee, "snapshots": history[:20]}
     return _render(request, "employee_detail.html", context)
@router.get("/runs", response_class=HTMLResponse)
 def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
     """Render the crawl history page: the 50 newest runs and the 50 newest errors."""
     require_admin(request, settings)
     newest_runs = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)
     newest_errors = select(CrawlError).order_by(desc(CrawlError.created_at)).limit(50)
     run_rows = db.scalars(newest_runs).all()
     error_rows = db.scalars(newest_errors).all()
     return _render(request, "runs.html", {"runs": run_rows, "errors": error_rows})
@router.post("/runs")
 def trigger_run(
     request: Request,
     background_tasks: BackgroundTasks,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ):
     """Schedule a crawl from the Runs page unless one is already running, then redirect back."""
     require_admin(request, settings)
     def _crawl() -> None:
         # The background task opens its own session instead of reusing the request-scoped one.
         with SessionLocal() as task_session:
             run_crawl_if_idle(task_session, settings)
     if not get_running_run(db):
         background_tasks.add_task(_crawl)
     return RedirectResponse("/admin/runs", status_code=303)
@router.post("/crawl-now")
 def crawl_now(
     request: Request,
     background_tasks: BackgroundTasks,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ):
     """Schedule a crawl from the dashboard unless one is already running, then redirect back."""
     require_admin(request, settings)
     def _crawl() -> None:
         # The background task opens its own session instead of reusing the request-scoped one.
         with SessionLocal() as task_session:
             run_crawl_if_idle(task_session, settings)
     if not get_running_run(db):
         background_tasks.add_task(_crawl)
     return RedirectResponse("/admin", status_code=303)
 def _render(request: Request, template: str, context: dict, status_code: int = 200) -> HTMLResponse:
     """Render ``template`` with the request and version strings merged into ``context``.
     Keys in ``context`` take precedence over the defaults injected here.
     """
     payload = dict(
         request=request,
         backend_version=BACKEND_VERSION,
         frontend_version=FRONTEND_VERSION,
     )
     payload.update(context)
     return templates.TemplateResponse(template, payload, status_code=status_code)
 def _parse_date(value: str | None):
    if not value:
        return None
    try:
        from datetime import date
        return date.fromisoformat(value)
    except ValueError:
        return None
--- a/app/api.py
+++ b/app/api.py
@@ -0,0 +1,133 @@
 from datetime import date
 from fastapi import APIRouter, BackgroundTasks, Depends, Request
 from sqlalchemy import desc, select
 from sqlalchemy.orm import Session
 from app.config import Settings, get_settings
 from app.db import SessionLocal, get_db
 from app.models import CrawlRun, Employee
 from app.security import require_admin
 from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
 from app.services.crawl_control import get_running_run, run_crawl_if_idle
 from app.version import BACKEND_VERSION, FRONTEND_VERSION
 # REST API router: every route declared in this module is served under the /api prefix.
 router = APIRouter(prefix="/api")
@router.get("/health")
 def health() -> dict:
     """Unauthenticated health probe reporting backend and frontend versions."""
     return dict(status="ok", backend_version=BACKEND_VERSION, frontend_version=FRONTEND_VERSION)
@router.get("/employees")
 def list_employees(
     request: Request,
     status: str | None = None,
     q: str | None = None,
     started_from: date | None = None,
     started_to: date | None = None,
     has_email: bool | None = None,
     sort: str = "full_name",
     direction: str = "asc",
     limit: int = 50,
     offset: int = 0,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Return one page of employees; all query parameters are forwarded to ``list_employees_page``."""
     require_admin(request, settings)
     query_options = {
         "status": status,
         "q": q,
         "started_from": started_from,
         "started_to": started_to,
         "has_email": has_email,
         "sort": sort,
         "direction": direction,
         "limit": limit,
         "offset": offset,
     }
     return list_employees_page(db, **query_options)
@router.get("/employees/{employee_id}")
 def get_employee(
     employee_id: int,
     request: Request,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Return the detail payload for one employee, or ``{"error": "not_found"}`` for an unknown id."""
     require_admin(request, settings)
     record = db.get(Employee, employee_id)
     return _employee_detail(record) if record else {"error": "not_found"}
@router.get("/crawl-runs")
 def list_crawl_runs(
     request: Request,
     limit: int = 20,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Return up to ``limit`` crawl runs, newest first, under the ``items`` key."""
     require_admin(request, settings)
     newest_first = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(limit)
     items = []
     for run in db.scalars(newest_first).all():
         items.append(run_payload(run))
     return {"items": items}
@router.get("/crawl-runs/latest")
 def latest_crawl_run(
     request: Request,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Return the payloads of the currently running crawl and of the most recently started one."""
     require_admin(request, settings)
     active_run = get_running_run(db)
     newest_stmt = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1)
     newest_run = db.scalar(newest_stmt)
     # Either value may be None here; both are handed to run_payload unchanged.
     return dict(running=run_payload(active_run), latest=run_payload(newest_run))
@router.post("/crawl-runs")
 def trigger_crawl(
     request: Request,
     background_tasks: BackgroundTasks,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Schedule a background crawl, or report the run that is already in progress."""
     require_admin(request, settings)
     active_run = get_running_run(db)
     if active_run:
         return {"status": "already_running", "run": run_payload(active_run)}
     def _crawl() -> None:
         # The background task opens its own session instead of reusing the request-scoped one.
         with SessionLocal() as task_session:
             run_crawl_if_idle(task_session, settings)
     background_tasks.add_task(_crawl)
     return {"status": "scheduled"}
@router.get("/stats")
 def stats(
     request: Request,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Return the aggregate statistics payload (admin only)."""
     require_admin(request, settings)
     payload = stats_payload(db)
     return payload
 def _employee_summary(employee: Employee) -> dict:
     """Return the display payload for ``employee`` (delegates to ``employee_display_payload``)."""
     summary = employee_display_payload(employee)
     return summary
 def _employee_detail(employee: Employee) -> dict:
     """Extend the employee summary with the current parsed data and the profile tabs."""
     detail = _employee_summary(employee)
     tabs = []
     for tab in employee.tabs:
         tabs.append({"title": tab.title, "href": tab.href, "data_index": tab.data_index})
     detail["current_data"] = employee.current_data
     detail["tabs"] = tabs
     return detail
 def _run_summary(run: CrawlRun) -> dict:
     """Return the run payload, substituting an empty dict when the payload is falsy."""
     payload = run_payload(run)
     return payload if payload else {}
--- a/app/config.py
+++ b/app/config.py
@@ -0,0 +1,25 @@
 from functools import lru_cache
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
     """Service configuration loaded by pydantic-settings, with ``.env`` as the env file.
     NOTE(review): the credential and secret defaults below are development
     placeholders; confirm they are overridden in every real deployment.
     """
     # Read .env as UTF-8 and ignore variables that do not map to a field.
     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
     # Defaults to a local SQLite file.
     database_url: str = "sqlite:///./miem_workers.db"
     # Page the crawler is pointed at.
     source_url: str = "https://miem.hse.ru/persons"
     # Cron expression for the scheduled crawl (README: weekly worker).
     crawl_cron: str = "0 3 * * 1"
     # presumably caps the number of profiles per crawl; None means no cap — verify in the crawler
     crawl_limit: int | None = None
     request_timeout: int = 30
     request_delay_seconds: float = 1.0
     parser_use_playwright: bool = False
     # Admin UI credentials (placeholder defaults).
     admin_username: str = "admin"
     admin_password: str = "admin"
     # Must be at least 8 characters long.
     session_secret: str = Field(default="dev-session-secret", min_length=8)
     # Bearer token expected by the /mcp endpoint (see README, section MCP).
     mcp_token: str = "dev-mcp-token"
@lru_cache
 def get_settings() -> Settings:
     """Build the application settings; used as a FastAPI dependency and by app.db."""
     loaded = Settings()
     return loaded
--- a/app/db.py
+++ b/app/db.py
@@ -0,0 +1,35 @@
 from collections.abc import Generator
 from sqlalchemy import create_engine
 from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
 from app.config import get_settings
 class Base(DeclarativeBase):
     """Declarative base class; ``init_db`` creates tables from ``Base.metadata``."""
     pass
 def _connect_args(database_url: str) -> dict[str, object]:
    if database_url.startswith("sqlite"):
        return {"check_same_thread": False}
    return {}
 # Module-level setup: settings are read and the engine is created once, when this module is imported.
 settings = get_settings()
 engine = create_engine(settings.database_url, connect_args=_connect_args(settings.database_url))
 # Session factory bound to the engine; autoflush and autocommit are both disabled.
 SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
 def init_db() -> None:
     """Create every table declared on ``Base.metadata`` using the module-level engine."""
     # Imported only for its side effect: loading app.models registers the model classes on Base.metadata.
     import app.models  # noqa: F401
     Base.metadata.create_all(bind=engine)
 def get_db() -> Generator[Session, None, None]:
     """Yield a database session and close it when the consumer is finished."""
     with SessionLocal() as session:
         yield session
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,24 @@
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from app.admin import router as admin_router
 from app.api import router as api_router
 from app.db import init_db
 from app.mcp import router as mcp_router
 from app.version import BACKEND_VERSION
 # Application object; its version mirrors BACKEND_VERSION.
 app = FastAPI(title="MIEM Employees", version=BACKEND_VERSION)
 # Static assets are served from app/static under /static.
 app.mount("/static", StaticFiles(directory="app/static"), name="static")
 # Routers: REST API (/api), admin UI (/admin) and the MCP endpoint (/mcp).
 app.include_router(api_router)
 app.include_router(admin_router)
 app.include_router(mcp_router)
@app.on_event("startup")
 def startup() -> None:
     """Startup hook: create the database tables via ``init_db``."""
     init_db()
@app.get("/")
 def root() -> dict:
     """Return service name, backend version and the admin UI path."""
     return dict(service="miem-employees", version=BACKEND_VERSION, admin="/admin")
--- a/app/mcp.py
+++ b/app/mcp.py
@@ -0,0 +1,170 @@
 import json
 from fastapi import APIRouter, Depends, Request
 from sqlalchemy import desc, or_, select
 from sqlalchemy.orm import Session
 from app.config import Settings, get_settings
 from app.db import get_db
 from app.models import CrawlRun, Employee
 from app.security import require_mcp_token
 # MCP router: the single endpoint of this module is served at POST /mcp.
 router = APIRouter(prefix="/mcp")
 # Tool catalogue returned verbatim by the "tools/list" method. Each entry gives the tool name,
 # a description and the JSON Schema of its arguments; the names must stay in sync with the
 # branches of _call_tool.
 TOOLS = [
     {
         "name": "search_employees",
         "description": "Search MIEM employees by name or profile URL.",
         "inputSchema": {
             "type": "object",
             "properties": {
                 "query": {"type": "string"},
                 "status": {"type": "string", "enum": ["active", "dismissed"]},
                 "limit": {"type": "integer", "default": 20},
             },
             "required": ["query"],
         },
     },
     {
         "name": "get_employee",
         "description": "Get one employee by profile id, profile key, or canonical URL.",
         "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
     },
     {
         "name": "list_employee_publications",
         "description": "List publications parsed from an employee profile.",
         "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
     },
     {
         "name": "list_employee_courses",
         "description": "List teaching courses parsed from an employee profile.",
         "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
     },
     {
         "name": "get_crawl_status",
         "description": "Return the latest crawl run status.",
         "inputSchema": {"type": "object", "properties": {}},
     },
 ]
@router.post("")
 async def mcp_http(
     request: Request,
     db: Session = Depends(get_db),
     settings: Settings = Depends(get_settings),
 ) -> dict:
     """Handle one JSON-RPC 2.0 request of the MCP endpoint.
     Supported methods: ``initialize``, ``tools/list`` and ``tools/call``.
     Every failure is reported as a JSON-RPC error object:
     -32700 for an unparseable body, -32600 for a body that is not a JSON
     object, -32601 for an unknown method and -32000 for an exception raised
     while handling the call.
     """
     # Local import: this module does not import app.version at the top level.
     from app.version import BACKEND_VERSION
     require_mcp_token(request, settings)
     try:
         payload = await request.json()
     except ValueError:
         # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
         return {"jsonrpc": "2.0", "id": None, "error": {"code": -32700, "message": "Parse error"}}
     if not isinstance(payload, dict):
         # Arrays (batches) and scalars have no .get(); reject them explicitly.
         return {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
     method = payload.get("method")
     request_id = payload.get("id")
     params = payload.get("params") or {}
     try:
         if method == "initialize":
             result = {
                 "protocolVersion": "2024-11-05",
                 # Report the real service version instead of a hard-coded string.
                 "serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
                 "capabilities": {"tools": {}},
             }
         elif method == "tools/list":
             result = {"tools": TOOLS}
         elif method == "tools/call":
             result = _call_tool(db, params.get("name"), params.get("arguments") or {})
         else:
             return {"jsonrpc": "2.0", "id": request_id, "error": {"code": -32601, "message": "Method not found"}}
         return {"jsonrpc": "2.0", "id": request_id, "result": result}
     except Exception as exc:
         # Protocol boundary: tool failures are returned to the client, not raised as HTTP 500.
         return {"jsonrpc": "2.0", "id": request_id, "error": {"code": -32000, "message": str(exc)}}
 def _call_tool(db: Session, name: str, arguments: dict) -> dict:
    if name == "search_employees":
        return _tool_response(_search_employees(db, arguments))
    if name == "get_employee":
        employee = _find_employee(db, arguments["profile_id_or_url"])
        return _tool_response(_employee_payload(employee) if employee else {"error": "not_found"})
    if name == "list_employee_publications":
        employee = _find_employee(db, arguments["profile_id_or_url"])
        return _tool_response(_collect_section_items(employee, "publications"))
    if name == "list_employee_courses":
        employee = _find_employee(db, arguments["profile_id_or_url"])
        return _tool_response(_collect_section_items(employee, "courses_by_year"))
    if name == "get_crawl_status":
        run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
        return _tool_response(_run_payload(run) if run else {"status": "never_run"})
    raise ValueError(f"Unknown tool: {name}")
 def _search_employees(db: Session, arguments: dict) -> list[dict]:
     """Search employees by name or canonical URL for the ``search_employees`` tool.
     ``arguments`` may contain ``query`` (case-insensitive substring), ``status``
     (exact match) and ``limit`` (default 20, clamped to 1..100). Returns summary
     payloads without the parsed profile data, ordered by full name.
     Raises ValueError when ``limit`` cannot be converted to an integer.
     """
     query = arguments.get("query", "")
     # Clamp at both ends: a negative LIMIT is an error on PostgreSQL and means
     # "no limit" on SQLite, which would bypass the upper cap.
     limit = max(1, min(int(arguments.get("limit") or 20), 100))
     stmt = select(Employee)
     if arguments.get("status"):
         stmt = stmt.where(Employee.status == arguments["status"])
     if query:
         pattern = f"%{query}%"
         stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern)))
     employees = db.scalars(stmt.order_by(Employee.full_name).limit(limit)).all()
     return [_employee_payload(employee, include_data=False) for employee in employees]
 def _find_employee(db: Session, value: str) -> Employee | None:
     """Resolve an employee from a profile key, profile id or (partial) canonical URL.
     An exact match on ``profile_key``, ``profile_id`` or ``canonical_url`` wins.
     Only when there is none is a case-insensitive substring match on the
     canonical URL tried. Returns None for an empty identifier or no match.
     """
     needle = "" if value is None else str(value).strip()
     if not needle:
         # An empty pattern would become ILIKE '%%' and match an arbitrary row.
         return None
     exact = select(Employee).where(
         or_(
             Employee.profile_key == needle,
             Employee.profile_id == needle,
             Employee.canonical_url == needle,
         )
     )
     employee = db.scalar(exact.limit(1))
     if employee is not None:
         return employee
     # Escape LIKE wildcards so "%" and "_" in the identifier are matched literally.
     escaped = needle.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
     fuzzy = select(Employee).where(Employee.canonical_url.ilike(f"%{escaped}%", escape="\\"))
     # A stable order makes the choice deterministic when several URLs contain the fragment.
     return db.scalar(fuzzy.order_by(Employee.id).limit(1))
 def _collect_section_items(employee: Employee | None, section_type: str) -> dict:
     """Gather the items of every ``section_type`` section in the employee's current data.
     ``publications`` sections contribute their ``publications`` list and
     ``courses_by_year`` sections their ``courses`` list. Without an employee or
     parsed data the result is just ``{"items": []}``.
     """
     if not (employee and employee.current_data):
         return {"items": []}
     # Key under which a section of the requested type keeps its entries.
     items_key = {"publications": "publications", "courses_by_year": "courses"}.get(section_type)
     collected = []
     for section in employee.current_data.get("sections") or []:
         if section.get("type") == section_type and items_key is not None:
             collected.extend(section.get(items_key) or [])
     return {"employee": _employee_payload(employee, include_data=False), "items": collected}
 def _employee_payload(employee: Employee, include_data: bool = True) -> dict:
     """Serialise an employee for MCP responses; ``include_data`` adds the parsed profile as ``data``."""
     seen_at = employee.last_seen_at
     gone_at = employee.dismissed_at
     payload = dict(
         profile_key=employee.profile_key,
         profile_id=employee.profile_id,
         full_name=employee.full_name,
         status=employee.status,
         canonical_url=employee.canonical_url,
         last_seen_at=seen_at.isoformat() if seen_at else None,
         dismissed_at=gone_at.isoformat() if gone_at else None,
     )
     if not include_data:
         return payload
     payload["data"] = employee.current_data
     return payload
 def _run_payload(run: CrawlRun) -> dict:
     """Serialise a crawl run for MCP responses, with timestamps as ISO 8601 strings or None."""
     def _iso(moment):
         return moment.isoformat() if moment else None
     payload = {"id": run.id, "status": run.status, "source_url": run.source_url}
     payload["started_at"] = _iso(run.started_at)
     payload["finished_at"] = _iso(run.finished_at)
     for counter in ("found_count", "parsed_count", "error_count", "dismissed_count"):
         payload[counter] = getattr(run, counter)
     return payload
 def _tool_response(data: object) -> dict:
    return {"content": [{"type": "text", "text": json.dumps(data, ensure_ascii=False, default=str)}]}
--- a/app/models.py
+++ b/app/models.py
@@ -0,0 +1,110 @@
 from datetime import datetime, timezone
 from sqlalchemy import DateTime, ForeignKey, Index, Integer, LargeBinary, String, Text, UniqueConstraint
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from sqlalchemy.types import JSON
 from app.db import Base
 def utcnow() -> datetime:
     """Return the current time as a timezone-aware UTC datetime (default for the timestamp columns)."""
     moment = datetime.now(tz=timezone.utc)
     return moment
 # Generic JSON column type that is swapped for JSONB on the "postgresql" dialect.
 json_type = JSON().with_variant(JSONB, "postgresql")
 class Employee(Base):
     """An employee profile together with its most recently parsed state."""
     __tablename__ = "employees"
     __table_args__ = (
         UniqueConstraint("profile_key", name="uq_employees_profile_key"),
         Index("ix_employees_full_name", "full_name"),
         Index("ix_employees_status", "status"),
     )
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     # Unique per employee (uq_employees_profile_key).
     profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
     profile_type: Mapped[str | None] = mapped_column(String(50))
     profile_id: Mapped[str | None] = mapped_column(String(255))
     canonical_url: Mapped[str] = mapped_column(Text, nullable=False)
     full_name: Mapped[str | None] = mapped_column(Text)
     # Defaults to "active"; the MCP tool schema also lists "dismissed".
     status: Mapped[str] = mapped_column(String(32), default="active", nullable=False)
     first_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
     last_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
     dismissed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
     parser_version: Mapped[str | None] = mapped_column(String(32))
     # Latest parsed profile; readers look for a "sections" list inside it.
     current_data: Mapped[dict | None] = mapped_column(json_type)
     # presumably a hex digest of current_data (64 characters) — verify against the writer
     current_checksum: Mapped[str | None] = mapped_column(String(64))
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
     # Refreshed on every UPDATE through onupdate.
     updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False)
     # No cascade is configured for snapshots, unlike tabs below.
     snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
     # Tabs are owned by the employee: removed from the collection or deleted with the parent.
     tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
 class EmployeeSnapshot(Base):
     """A stored parse result for an employee, optionally linked to the crawl run that produced it."""
     __tablename__ = "employee_snapshots"
     __table_args__ = (Index("ix_employee_snapshots_employee_id", "employee_id"),)
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
     # Nullable: a snapshot does not have to belong to a crawl run.
     crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
     parsed_data: Mapped[dict] = mapped_column(json_type, nullable=False)
     # Optional raw page bytes.
     html_snapshot: Mapped[bytes | None] = mapped_column(LargeBinary)
     checksum: Mapped[str] = mapped_column(String(64), nullable=False)
     parser_version: Mapped[str | None] = mapped_column(String(32))
     captured_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
     employee: Mapped[Employee] = relationship(back_populates="snapshots")
 class CrawlRun(Base):
     """One execution of the crawler, with its progress counters."""
     __tablename__ = "crawl_runs"
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     source_url: Mapped[str] = mapped_column(Text, nullable=False)
     # A new row starts in the "running" state.
     status: Mapped[str] = mapped_column(String(32), default="running", nullable=False)
     started_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
     finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
     # Counters all start at zero; per the README the admin UI derives progress from
     # found_count, parsed_count and error_count.
     found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
     parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
     new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
     error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
     dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
     message: Mapped[str | None] = mapped_column(Text)
 class CrawlError(Base):
     """An error recorded during a crawl run."""
     __tablename__ = "crawl_errors"
     __table_args__ = (Index("ix_crawl_errors_run_id", "crawl_run_id"),)
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
     # Nullable: an error does not have to refer to a specific profile.
     profile_url: Mapped[str | None] = mapped_column(Text)
     error_type: Mapped[str] = mapped_column(String(255), nullable=False)
     message: Mapped[str] = mapped_column(Text, nullable=False)
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
 class ProfileTab(Base):
     """A navigation tab (title and link) belonging to an employee profile."""
     __tablename__ = "profile_tabs"
     __table_args__ = (Index("ix_profile_tabs_employee_id", "employee_id"),)
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
     title: Mapped[str] = mapped_column(Text, nullable=False)
     href: Mapped[str] = mapped_column(Text, nullable=False)
     # presumably the anchor's "data-index" attribute as read by the profile parser — verify against the writer
     data_index: Mapped[str | None] = mapped_column(String(64))
     employee: Mapped[Employee] = relationship(back_populates="tabs")
 class ParserSource(Base):
     """A source URL entry with an enabled flag; each URL may appear only once."""
     __tablename__ = "parser_sources"
     __table_args__ = (UniqueConstraint("source_url", name="uq_parser_sources_source_url"),)
     id: Mapped[int] = mapped_column(Integer, primary_key=True)
     source_url: Mapped[str] = mapped_column(Text, nullable=False)
     # Enabled by default.
     enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
--- a/app/parser/__init__.py
+++ b/app/parser/__init__.py
@@ -0,0 +1 @@
 """HTML parsing helpers for HSE/MIEM employee pages."""
--- a/app/parser/collector.py
+++ b/app/parser/collector.py
@@ -0,0 +1,19 @@
 from bs4 import BeautifulSoup
 from requests import Session
 from app.parser.profile_url import normalize_profile_url
 def collect_profile_links(session: Session, source_url: str, headers: dict[str, str], timeout: int) -> list[str]:
     """Download ``source_url`` and return the distinct normalised profile URLs in first-seen order.
     Anchors whose href ``normalize_profile_url`` maps to a falsy value are skipped.
     Raises the ``requests`` HTTP error raised by ``raise_for_status`` on a non-success response.
     """
     response = session.get(source_url, headers=headers, timeout=timeout)
     response.raise_for_status()
     document = BeautifulSoup(response.text, "html.parser")
     candidates = (normalize_profile_url(link["href"]) for link in document.find_all("a", href=True))
     # dict.fromkeys removes duplicates while keeping first-seen order.
     return list(dict.fromkeys(url for url in candidates if url))
--- a/app/parser/profile.py
+++ b/app/parser/profile.py
@@ -0,0 +1,380 @@
 import re
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString, Tag
 from requests import Session
 from app.parser.profile_url import normalize_profile_url, parse_profile_identity
 from app.version import BACKEND_VERSION
 # Captures the four-digit year that follows "Начал/Начала/Начали работать ... ВШЭ" (case-insensitive).
 _YEAR_PATTERN = re.compile(r"Начал[аи]?\s+работать.*?ВШЭ.*?(\d{4})", re.IGNORECASE)
 # Captures anything shaped like local-part@domain.tld.
 _EMAIL_PATTERN = re.compile(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})")
 # Captures the run of digits, spaces and "+()-" (8 or more characters) after a "Телефон:" / "Phone:" label.
 _PHONE_PATTERN = re.compile(r"(?:Телефон|Phone)\s*:\s*([+()\d\-\s]{8,})", re.IGNORECASE)
 def normalize_ws(value: str | None) -> str:
    return re.sub(r"\s+", " ", value or "").strip()
 def extract_person_tabs(soup: BeautifulSoup, source_url: str) -> list[dict[str, str | None]]:
    """Return the profile menu tabs as ``data_index``/``title``/``href`` dicts.

    The more specific menu selector is tried first; the first selector that
    yields at least one usable link wins. Hrefs are resolved against
    ``source_url`` and the result is passed through ``_dedupe_tabs``. An empty
    list is returned when no menu produces a tab.
    """
    menu_selectors = (
        "div.person-menu.is-desktop.small.person-menu-addition",
        ".person-menu",
    )
    for menu_selector in menu_selectors:
        container = soup.select_one(menu_selector)
        if not container:
            continue
        found = []
        for link in container.select("a[href]"):
            caption = normalize_ws(link.get_text(" ", strip=True))
            target = link.get("href", "").strip()
            if not (caption and target):
                continue
            found.append(
                {
                    "data_index": link.get("data-index"),
                    "title": caption,
                    "href": urljoin(source_url, target),
                }
            )
        if found:
            return _dedupe_tabs(found)
    return []
 def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
    """Extract name, positions, start year, contacts and external IDs.

    Contacts and the start year are matched against the whitespace-normalized
    text of the whole page; positions and external identifier links come from
    CSS selections. Returns a dict with the keys ``source_url``, ``full_name``,
    ``positions``, ``hse_start_year``, ``contacts`` and ``external_ids``.
    """
    heading = soup.select_one("h1.person-caption") or soup.find("h1")
    page_text = normalize_ws(soup.get_text(" ", strip=True))
    started = _YEAR_PATTERN.search(page_text)

    # dict.fromkeys removes repeats while keeping first-seen order.
    emails = list(dict.fromkeys(_EMAIL_PATTERN.findall(page_text)))
    phones = list(
        dict.fromkeys(number for number in map(normalize_ws, _PHONE_PATTERN.findall(page_text)) if number)
    )
    address = None
    found_address = re.search(
        r"(Адрес[:\s].{0,220}?)(?:\s+Время|\s+Расписание|\s+SPIN|\s+ORCID|$)",
        page_text,
        flags=re.IGNORECASE,
    )
    if found_address:
        address = normalize_ws(found_address.group(1)).rstrip(",")
    contacts = {"phones": phones, "emails": emails, "address": address, "items": []}

    position_texts = (
        normalize_ws(li.get_text(" ", strip=True))
        for li in soup.select("ul.g-ul.g-list.small.employment-add li, ul.employment-add li")
    )
    positions = [entry for entry in position_texts if entry]

    id_domains = (
        ("ORCID", "orcid.org"),
        ("Scopus AuthorID", "scopus.com"),
        ("ResearcherID", "webofscience.com"),
        ("Google Scholar", "scholar.google."),
        ("SPIN РИНЦ", "elibrary.ru"),
    )
    external_ids = []
    for link in soup.select("a[href]"):
        target = link.get("href", "").strip()
        caption = normalize_ws(link.get_text(" ", strip=True))
        # Only the first matching identifier system is recorded per link.
        system = next((name for name, marker in id_domains if marker in target), None)
        if system is not None:
            external_ids.append({"system": system, "value": caption or system, "url": target})

    full_name = normalize_ws(heading.get_text(" ", strip=True)) if heading else None
    hse_start_year = int(started.group(1)) if started else None
    return {
        "source_url": source_url,
        "full_name": full_name,
        "positions": positions,
        "hse_start_year": hse_start_year,
        "contacts": contacts,
        "external_ids": _dedupe_dicts(external_ids),
    }
 def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
    """Split the page into one dict per ``<h2>`` heading and its following siblings.

    Each section carries ``title``, ``slug``, ``type``, ``raw_text``,
    ``paragraphs``, ``items`` and ``links``; depending on the inferred type,
    extra keys are added or generic ones removed. Headings with an empty
    title, or whose title mentions "расписание занятий", are skipped.
    """
    sections = []
    for h2 in soup.select("h2"):
        title = normalize_ws(h2.get_text(" ", strip=True))
        if not title or "расписание занятий" in title.lower():
            continue
        # Everything between this heading and the next sibling <h2>.
        nodes = _collect_between_h2(h2)
        raw_text = _nodes_raw_text(nodes)
        paragraphs = _nodes_paragraphs(nodes)
        items = _nodes_list_items(nodes)
        links = []
        for node in nodes:
            if isinstance(node, Tag):
                links.extend(_extract_links(node, source_url))
        section_type = _infer_section_type(title, nodes)
        section = {
            "title": title,
            "slug": _slugify(title),
            "type": section_type,
            "raw_text": raw_text,
            "paragraphs": paragraphs,
            "items": items,
            "links": links,
        }
        if section_type == "publications":
            section["publications_count"], section["publications"] = _parse_publications(title, nodes, source_url)
            # Replace the generic list items with the text of each parsed publication.
            section["items"] = [item["text"] for item in section["publications"] if item.get("text")]
        elif section_type == "courses_by_year":
            section["academic_year"], section["courses"] = _parse_courses(title, nodes, source_url)
            # Course sections expose only the structured "courses" list.
            section.pop("items", None)
            section.pop("links", None)
        elif section_type == "table":
            section["table"] = _parse_table(nodes, source_url)
        elif "выпускные квалификационные работы студентов ниу вшэ" in title.lower():
            section["items"] = _parse_vkr_items(nodes)
        # Year-marked entries are collected for every section regardless of its type.
        year_entries = _parse_year_entries(nodes, source_url)
        if year_entries:
            section["year_entries"] = year_entries
            # Only the untyped fallbacks are relabelled; specific types keep their own label.
            if section_type in {"generic", "paragraphs"}:
                section["type"] = "year_blocks"
        sections.append(section)
    return sections
 def parse_person_profile(
    session: Session,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
    use_playwright: bool = False,
 ) -> dict | None:
    """Fetch one profile page and return its parsed representation.

    Returns ``None`` when ``source_url`` does not normalize to a profile URL.
    The page is always fetched through ``session``; when ``use_playwright`` is
    true the fetched HTML is handed to ``_render_with_playwright`` as the
    fallback and the rendered markup is parsed instead. The raw markup is
    included under the ``_html`` key. Raises whatever
    ``response.raise_for_status()`` raises on an HTTP error status.
    """
    canonical_url = normalize_profile_url(source_url)
    if not canonical_url:
        return None
    reply = session.get(canonical_url, headers=headers, timeout=timeout)
    reply.raise_for_status()
    markup = reply.text
    if use_playwright:
        markup = _render_with_playwright(canonical_url, markup)
    document = BeautifulSoup(markup, "html.parser")
    kind, identifier = parse_profile_identity(canonical_url)
    header = extract_person_header(document, canonical_url)
    tabs = extract_person_tabs(document, canonical_url)
    sections = extract_sections(document, canonical_url)
    tab_targets = [tab["href"] for tab in tabs if tab.get("href")]
    result = {
        "source_url": canonical_url,
        "profile_type": kind,
        "profile_id": identifier,
        "full_name": header.get("full_name"),
        "positions": header.get("positions") or [],
        "hse_start_year": header.get("hse_start_year"),
        "contacts": header.get("contacts") or {},
        "external_ids": header.get("external_ids") or [],
        "tabs": tabs,
        "sections": sections,
        "employee_internal_links": tab_targets,
        "parser_version": BACKEND_VERSION,
        "_html": markup,
    }
    return result
 def _render_with_playwright(source_url: str, fallback_html: str) -> str:
    try:
        from playwright.sync_api import sync_playwright
    except Exception:
        return fallback_html
    try:
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(source_url, wait_until="domcontentloaded", timeout=45000)
            for index in range(page.locator(".person-menu a").count()):
                try:
                    page.locator(".person-menu a").nth(index).click(timeout=2500, force=True)
                    page.wait_for_timeout(450)
                except Exception:
                    continue
            html = page.content()
            browser.close()
            return html
    except Exception:
        return fallback_html
 def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]:
    """Return the siblings after ``start_h2`` up to the next sibling ``<h2>``.

    Whitespace-only text nodes are skipped; every other sibling is kept in
    document order.
    """
    collected = []
    for node in start_h2.next_siblings:
        if isinstance(node, Tag):
            if node.name == "h2":
                break
        elif isinstance(node, NavigableString) and not normalize_ws(str(node)):
            continue
        collected.append(node)
    return collected
 def _extract_links(node: Tag, source_url: str) -> list[dict[str, str]]:
    """Return ``{"text", "url"}`` dicts for the anchors inside ``node``.

    Anchors with empty text or href are skipped, as are timetable links
    (href containing "timetable" or text containing "расписание"). URLs are
    resolved against ``source_url``.
    """
    result = []
    for anchor in node.select("a[href]"):
        caption = normalize_ws(anchor.get_text(" ", strip=True))
        target = anchor.get("href", "").strip()
        if not caption or not target:
            continue
        if "timetable" in target.lower() or "расписание" in caption.lower():
            continue
        result.append({"text": caption, "url": urljoin(source_url, target)})
    return result
 def _nodes_raw_text(nodes: list) -> str:
    """Join the normalized text of every node with newlines, skipping empty ones."""
    lines = (
        normalize_ws(node.get_text(" ", strip=True) if isinstance(node, Tag) else str(node))
        for node in nodes
    )
    return "\n".join(line for line in lines if line)
 def _nodes_paragraphs(nodes: list) -> list[str]:
    """Return the non-empty normalized text of every ``<p>`` inside the tag nodes."""
    collected = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for paragraph in node.select("p"):
            text = normalize_ws(paragraph.get_text(" ", strip=True))
            if text:
                collected.append(text)
    return collected
 def _nodes_list_items(nodes: list) -> list[str]:
    """Return the normalized text of every ``<li>`` inside the tag nodes.

    Empty items and items mentioning "расписание" are left out.
    """
    collected = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            text = normalize_ws(li.get_text(" ", strip=True))
            if text and "расписание" not in text.lower():
                collected.append(text)
    return collected
 def _infer_section_type(title: str, nodes: list) -> str:
    """Classify a section from its title and content nodes.

    Checks run in priority order: a table in the nodes, then title keywords
    (publications, courses), then list items, then paragraphs; ``"generic"``
    is returned when nothing matches.
    """
    title_lc = title.lower()
    if _has_table(nodes):
        return "table"
    keyword_types = (
        ("публикац", "publications"),
        ("учебные курсы", "courses_by_year"),
    )
    for needle, kind in keyword_types:
        if needle in title_lc:
            return kind
    if _nodes_list_items(nodes):
        return "list"
    return "paragraphs" if _nodes_paragraphs(nodes) else "generic"
 def _has_table(nodes: list) -> bool:
    """Return True when any tag node is a ``<table>`` or contains one."""
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        if node.name == "table" or node.find("table"):
            return True
    return False
 def _parse_table(nodes: list, source_url: str) -> dict:
    """Parse the first table found in ``nodes`` into headers and rows.

    Returns ``{"headers": [...], "rows": [...]}``. Each row holds the text of
    its ``<td>`` cells plus the resolved URL of the first link in the row (or
    ``None``); rows without ``<td>`` cells are skipped. Empty lists are
    returned when no table exists.
    """
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        table = node if node.name == "table" else node.find("table")
        if not table:
            continue
        header_cells = [normalize_ws(th.get_text(" ", strip=True)) for th in table.select("th")]
        body_rows = []
        for tr in table.select("tr"):
            cells = [normalize_ws(td.get_text(" ", strip=True)) for td in tr.select("td")]
            if not cells:
                continue
            anchor = tr.find("a", href=True)
            link_url = urljoin(source_url, anchor["href"]) if anchor else None
            body_rows.append({"cells": cells, "link_url": link_url})
        return {"headers": header_cells, "rows": body_rows}
    return {"headers": [], "rows": []}
 def _parse_publications(title: str, nodes: list, source_url: str) -> tuple[int | None, list[dict]]:
    """Return the count parsed from the title and the publication entries.

    The count is the number at the very end of ``title`` (``None`` if absent).
    Entries come from the ``<li>`` items of the first tag node that yields any;
    when no node has list items, every non-empty line of the raw text becomes
    an entry without a URL.
    """
    trailing_number = re.search(r"(\d+)\s*$", title)
    declared_total = int(trailing_number.group(1)) if trailing_number else None
    entries = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            full_text = normalize_ws(li.get_text(" ", strip=True))
            if not full_text:
                continue
            link = li.find("a", href=True)
            if link:
                entry_title = normalize_ws(link.get_text(" ", strip=True))
                entry_url = urljoin(source_url, link["href"])
            else:
                entry_title = full_text
                entry_url = None
            entries.append({"title": entry_title, "url": entry_url, "text": full_text})
        # Stop at the first node that produced publications.
        if entries:
            break
    if not entries:
        raw_lines = _nodes_raw_text(nodes).split("\n")
        entries = [{"title": line, "url": None, "text": line} for line in raw_lines if line]
    return declared_total, entries
 def _parse_courses(title: str, nodes: list, source_url: str) -> tuple[str | None, list[dict]]:
    """Return the ``YYYY/YYYY`` year from the title and the de-duplicated courses.

    Each ``<li>`` yields ``{"title", "url"}``; the title is the link text when
    the item has a link, otherwise the whole item text. Items with an empty
    title are skipped.
    """
    academic_year = None
    year_hit = re.search(r"(\d{4}/\d{4})", title)
    if year_hit:
        academic_year = year_hit.group(1)
    found = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            link = li.find("a", href=True)
            source = link if link else li
            name = normalize_ws(source.get_text(" ", strip=True))
            if not name:
                continue
            found.append({"title": name, "url": urljoin(source_url, link["href"]) if link else None})
    return academic_year, _dedupe_dicts(found)
 def _parse_year_entries(nodes: list, source_url: str) -> list[dict]:
    """Collect year-marked entries from ``.person-list-hangover`` elements.

    For each marker that has a parent, the entry holds the first 19xx/20xx
    year found in the marker text (or ``None``), the normalized text of the
    parent, and the links extracted from the parent.
    """
    collected = []
    tag_nodes = (node for node in nodes if isinstance(node, Tag))
    for node in tag_nodes:
        for marker in node.select(".person-list-hangover"):
            year_hit = re.search(r"(19\d{2}|20\d{2})", marker.get_text(" ", strip=True))
            container = marker.parent
            if not container:
                continue
            year = int(year_hit.group(1)) if year_hit else None
            collected.append(
                {
                    "year": year,
                    "text": normalize_ws(container.get_text(" ", strip=True)),
                    "links": _extract_links(container, source_url),
                }
            )
    return collected
 def _parse_vkr_items(nodes: list) -> list[str]:
    """Return the distinct non-empty ``<li>`` texts, in first-seen order."""
    unique = {}
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            text = normalize_ws(li.get_text(" ", strip=True))
            if text:
                # The dict acts as an insertion-ordered set.
                unique.setdefault(text, None)
    return list(unique)
 def _slugify(value: str) -> str:
    cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
    return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"
 def _dedupe_tabs(items: list[dict]) -> list[dict]:
    seen = set()
    unique = []
    for item in items:
        key = (item.get("title"), item.get("href"))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
 def _dedupe_dicts(items: list[dict]) -> list[dict]:
    seen = set()
    unique = []
    for item in items:
        key = tuple(sorted(item.items()))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
--- a/app/parser/profile_url.py
+++ b/app/parser/profile_url.py
@@ -0,0 +1,46 @@
 import re
 from urllib.parse import urljoin, urlsplit, urlunsplit
 BASE_URL = "https://www.hse.ru"
 _ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$")
 _STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$")
 def normalize_profile_url(href: str | None) -> str | None:
    if not href:
        return None
    candidate = urljoin(BASE_URL + "/", href.strip())
    split = urlsplit(candidate)
    path = split.path.rstrip("/")
    org_match = _ORG_PATTERN.match(path + "/")
    if org_match:
        return urlunsplit(("https", "www.hse.ru", f"/org/persons/{org_match.group(1)}", "", ""))
    staff_match = _STAFF_PATTERN.match(path + "/")
    if staff_match:
        return urlunsplit(("https", "www.hse.ru", f"/staff/{staff_match.group(1)}", "", ""))
    return None
 def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Return ``(profile_type, profile_id)`` for a profile URL.

    The type is ``"org_person"`` or ``"staff"``; ``(None, None)`` is returned
    when the URL does not normalize to a known profile path.
    """
    canonical = normalize_profile_url(profile_url)
    if not canonical:
        return None, None
    probe = urlsplit(canonical).path.rstrip("/") + "/"
    known_kinds = (("org_person", _ORG_PATTERN), ("staff", _STAFF_PATTERN))
    for kind, pattern in known_kinds:
        hit = pattern.match(probe)
        if hit:
            return kind, hit.group(1)
    return None, None
 def profile_key(profile_url: str) -> str | None:
    """Return ``"<profile_type>:<profile_id>"`` for the URL, or ``None`` if it is not a profile."""
    kind, identifier = parse_profile_identity(profile_url)
    return f"{kind}:{identifier}" if kind and identifier else None
--- a/app/security.py
+++ b/app/security.py
@@ -0,0 +1,52 @@
 import base64
 import hashlib
 import hmac
 import json
 import time
 from fastapi import HTTPException, Request, status
 from app.config import Settings
 # Name of the cookie that carries the signed admin session token.
 SESSION_COOKIE = "miem_admin_session"
 def verify_admin(username: str, password: str, settings: Settings) -> bool:
    """Return True when both credentials match the configured admin account.

    Values are compared as UTF-8 bytes because ``hmac.compare_digest`` raises
    ``TypeError`` for ``str`` arguments containing non-ASCII characters. Both
    comparisons always run, so a wrong username does not skip the password
    check.
    """
    username_ok = hmac.compare_digest(username.encode("utf-8"), settings.admin_username.encode("utf-8"))
    password_ok = hmac.compare_digest(password.encode("utf-8"), settings.admin_password.encode("utf-8"))
    return username_ok and password_ok
 def sign_session(username: str, settings: Settings) -> str:
    """Build a signed session token of the form ``<payload>.<signature>``.

    The payload is URL-safe base64 of compact JSON holding ``sub`` (the
    username) and ``iat`` (current Unix time); the signature is the hex
    HMAC-SHA256 of the payload keyed with ``settings.session_secret``.
    """
    claims = {"sub": username, "iat": int(time.time())}
    raw = json.dumps(claims, separators=(",", ":")).encode("utf-8")
    payload = base64.urlsafe_b64encode(raw).decode("ascii")
    mac = hmac.new(settings.session_secret.encode("utf-8"), payload.encode("ascii"), hashlib.sha256)
    return payload + "." + mac.hexdigest()
 def read_session(token: str | None, settings: Settings) -> str | None:
    """Return the ``sub`` claim of a validly signed session token, else ``None``.

    ``None`` is returned for a missing token, a token without a ``.``
    separator, a token containing non-ASCII characters, a signature mismatch,
    or a payload that is not base64-encoded JSON holding an object.
    """
    if not token or "." not in token:
        return None
    payload, signature = token.rsplit(".", 1)
    try:
        payload_bytes = payload.encode("ascii")
        signature_bytes = signature.encode("ascii")
    except UnicodeEncodeError:
        # Genuine tokens are pure ASCII; anything else is a forged or garbled cookie.
        return None
    expected = hmac.new(settings.session_secret.encode("utf-8"), payload_bytes, hashlib.sha256).hexdigest()
    if not hmac.compare_digest(signature_bytes, expected.encode("ascii")):
        return None
    try:
        data = json.loads(base64.urlsafe_b64decode(payload_bytes))
    except (ValueError, TypeError):
        # Covers bad base64 (binascii.Error), bad JSON and undecodable bytes.
        return None
    if not isinstance(data, dict):
        return None
    return data.get("sub")
 def require_admin(request: Request, settings: Settings) -> str:
    """Return the admin username from the session cookie.

    Raises ``HTTPException`` with status 303 and a ``Location`` header of
    ``/admin/login`` when the cookie is missing or invalid.
    """
    cookie_value = request.cookies.get(SESSION_COOKIE)
    username = read_session(cookie_value, settings)
    if username:
        return username
    raise HTTPException(status_code=status.HTTP_303_SEE_OTHER, headers={"Location": "/admin/login"})
 def require_mcp_token(request: Request, settings: Settings) -> None:
    """Validate the ``Authorization: Bearer <token>`` header against ``settings.mcp_token``.

    Raises ``HTTPException`` with status 401 when the header is absent, does
    not start with ``"Bearer "``, or carries a different token. Tokens are
    compared as UTF-8 bytes because ``hmac.compare_digest`` raises
    ``TypeError`` for ``str`` arguments containing non-ASCII characters, which
    would surface as a server error instead of a 401.
    """
    auth = request.headers.get("authorization", "")
    authorized = False
    if auth.startswith("Bearer "):
        presented = auth.removeprefix("Bearer ").strip()
        authorized = hmac.compare_digest(presented.encode("utf-8"), settings.mcp_token.encode("utf-8"))
    if not authorized:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid MCP token")
--- a/app/services/__init__.py
+++ b/app/services/__init__.py
@@ -0,0 +1 @@
 """Application services."""
--- a/app/services/admin_data.py
+++ b/app/services/admin_data.py
@@ -0,0 +1,159 @@
 from __future__ import annotations
 from datetime import date, datetime, time
 from math import ceil
 from typing import Any
 from sqlalchemy import Select, Text, and_, desc, func, or_, select
 from sqlalchemy.orm import Session
 from app.models import CrawlRun, Employee
 # Maps each accepted sort key to the expression used for ORDER BY.
 EMPLOYEE_SORTS = {
    "full_name": Employee.full_name,
    "status": Employee.status,
    "first_seen_at": Employee.first_seen_at,
    "last_seen_at": Employee.last_seen_at,
    "dismissed_at": Employee.dismissed_at,
    # Read from the current_data JSON document and compared as an integer.
    "hse_start_year": Employee.current_data["hse_start_year"].as_integer(),
 }
 def employee_display_payload(employee: Employee) -> dict[str, Any]:
    """Flatten an employee row and its ``current_data`` into a display dict.

    Missing or empty ``current_data`` parts fall back to empty containers, the
    ``*_text`` fields are joined strings of the matching lists, and timestamps
    are ISO-8601 strings or ``None``.
    """
    data = employee.current_data or {}
    contacts = data.get("contacts") or {}
    sections = data.get("sections") or []
    positions = data.get("positions") or []
    emails = contacts.get("emails") or []
    phones = contacts.get("phones") or []

    def _iso(moment):
        # Unset timestamps stay None instead of failing on isoformat().
        return moment.isoformat() if moment else None

    payload = {
        "id": employee.id,
        "full_name": employee.full_name,
        "status": employee.status,
        "canonical_url": employee.canonical_url,
        "positions": positions,
        "positions_text": "; ".join(positions),
        "hse_start_year": data.get("hse_start_year"),
        "emails": emails,
        "email_text": ", ".join(emails),
        "phones": phones,
        "phone_text": ", ".join(phones),
        "address": contacts.get("address"),
        "publications_count": _count_section_items(sections, "publications"),
        "courses_count": _count_section_items(sections, "courses_by_year"),
        "first_seen_at": _iso(employee.first_seen_at),
        "last_seen_at": _iso(employee.last_seen_at),
        "dismissed_at": _iso(employee.dismissed_at),
    }
    return payload
 def build_employee_query(
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
 ) -> Select[tuple[Employee]]:
    """Build the filtered ``SELECT`` over employees used by the directory.

    Args:
        status: Exact match on ``Employee.status``.
        q: Case-insensitive substring looked up in the full name and the
            canonical URL. ``%`` and ``_`` are matched literally.
        started_from: Keep rows whose ``first_seen_at`` is on or after the
            start of this day.
        started_to: Keep rows whose ``first_seen_at`` is on or before the end
            of this day.
        has_email: ``True`` keeps rows whose serialized ``current_data``
            contains ``@``; ``False`` keeps rows without data or without
            ``@``; ``None`` applies no email filter.

    Returns:
        The statement with all requested filters AND-ed together; no ordering
        or pagination is applied.
    """
    stmt = select(Employee)
    filters = []
    if status:
        filters.append(Employee.status == status)
    if q:
        # Escape LIKE metacharacters so user text such as "50%" or "a_b" is
        # searched literally rather than acting as a wildcard.
        escaped = q.replace("/", "//").replace("%", "/%").replace("_", "/_")
        pattern = f"%{escaped}%"
        filters.append(
            or_(
                Employee.full_name.ilike(pattern, escape="/"),
                Employee.canonical_url.ilike(pattern, escape="/"),
            )
        )
    if started_from:
        filters.append(Employee.first_seen_at >= datetime.combine(started_from, time.min))
    if started_to:
        filters.append(Employee.first_seen_at <= datetime.combine(started_to, time.max))
    if has_email is True:
        # Coarse check: any "@" anywhere in the serialized profile data counts.
        filters.append(Employee.current_data.cast(Text).ilike("%@%"))
    elif has_email is False:
        filters.append(or_(Employee.current_data.is_(None), ~Employee.current_data.cast(Text).ilike("%@%")))
    if filters:
        stmt = stmt.where(and_(*filters))
    return stmt
 def list_employees_page(
    db: Session,
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
 ) -> dict[str, Any]:
    """Return one page of employees plus pagination metadata.

    ``limit`` is clamped to 1..200 and ``offset`` to a minimum of 0. An
    unknown ``sort`` key falls back to the full name; any ``direction`` other
    than ``"desc"`` sorts ascending. The result holds ``items``, ``total``,
    ``limit``, ``offset``, ``pages`` and the 1-based ``page``.
    """
    page_size = max(1, min(limit, 200))
    start = max(0, offset)
    filtered = build_employee_query(
        status=status,
        q=q,
        started_from=started_from,
        started_to=started_to,
        has_email=has_email,
    )
    count_stmt = select(func.count()).select_from(filtered.subquery())
    total = db.scalar(count_stmt) or 0
    sort_column = EMPLOYEE_SORTS.get(sort, Employee.full_name)
    if direction == "desc":
        ordering = desc(sort_column)
    else:
        ordering = sort_column
    page_stmt = filtered.order_by(ordering).limit(page_size).offset(start)
    rows = db.scalars(page_stmt).all()
    items = [employee_display_payload(row) for row in rows]
    page_count = ceil(total / page_size) if total else 0
    return {
        "items": items,
        "total": total,
        "limit": page_size,
        "offset": start,
        "pages": page_count,
        "page": (start // page_size) + 1,
    }
 def stats_payload(db: Session) -> dict[str, Any]:
    """Collect the dashboard numbers: employee counts, latest addition and runs.

    ``latest_run`` is the most recently started run of any status,
    ``current_running_run`` the most recently started run still marked
    ``"running"``; either is ``None`` when no such run exists.
    """
    newest_first = desc(CrawlRun.started_at)
    latest_run = db.scalar(select(CrawlRun).order_by(newest_first).limit(1))
    running_run = db.scalar(
        select(CrawlRun).where(CrawlRun.status == "running").order_by(newest_first).limit(1)
    )
    latest_added = db.scalar(select(Employee).order_by(desc(Employee.first_seen_at)).limit(1))

    def _count(*criteria):
        # Number of employees matching the criteria (all employees when none are given).
        stmt = select(func.count()).select_from(Employee)
        if criteria:
            stmt = stmt.where(*criteria)
        return db.scalar(stmt) or 0

    return {
        "total": _count(),
        "active": _count(Employee.status == "active"),
        "dismissed": _count(Employee.status == "dismissed"),
        "new_in_last_run": latest_run.new_count if latest_run else 0,
        "latest_added": employee_display_payload(latest_added) if latest_added else None,
        "latest_run": run_payload(latest_run) if latest_run else None,
        "current_running_run": run_payload(running_run) if running_run else None,
    }
 def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
    """Serialize a crawl run, adding derived progress fields.

    ``processed_count`` is parsed plus errored profiles and
    ``progress_percent`` is that share of ``found_count`` rounded to one
    decimal (0 while nothing has been found). Returns ``None`` when ``run``
    is falsy.
    """
    if not run:
        return None

    def _iso(moment):
        return moment.isoformat() if moment else None

    done = run.parsed_count + run.error_count
    if run.found_count:
        percent = round((done / run.found_count) * 100, 1)
    else:
        percent = 0
    payload = {
        "id": run.id,
        "source_url": run.source_url,
        "status": run.status,
        "started_at": _iso(run.started_at),
        "finished_at": _iso(run.finished_at),
        "found_count": run.found_count,
        "parsed_count": run.parsed_count,
        "new_count": run.new_count,
        "error_count": run.error_count,
        "dismissed_count": run.dismissed_count,
        "processed_count": done,
        "progress_percent": percent,
        "message": run.message,
    }
    return payload
 def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
    total = 0
    for section in sections:
        if section.get("type") != section_type:
            continue
        if section_type == "publications":
            total += len(section.get("publications") or section.get("items") or [])
        elif section_type == "courses_by_year":
            total += len(section.get("courses") or [])
    return total
--- a/app/services/crawl_control.py
+++ b/app/services/crawl_control.py
@@ -0,0 +1,17 @@
 from sqlalchemy import desc, select
 from sqlalchemy.orm import Session
 from app.config import Settings
 from app.models import CrawlRun
 from app.services.crawler import run_crawl
 def get_running_run(db: Session) -> CrawlRun | None:
    """Return the most recently started run with status ``"running"``, or ``None``."""
    stmt = (
        select(CrawlRun)
        .where(CrawlRun.status == "running")
        .order_by(desc(CrawlRun.started_at))
        .limit(1)
    )
    return db.scalar(stmt)
 def run_crawl_if_idle(db: Session, settings: Settings) -> tuple[CrawlRun, bool]:
    """Start a crawl unless one is already marked as running.

    Returns ``(run, started)``: the existing running run with ``False``, or
    the run produced by ``run_crawl`` with ``True``.
    """
    in_progress = get_running_run(db)
    if not in_progress:
        return run_crawl(db, settings), True
    return in_progress, False
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -0,0 +1,160 @@
 import gzip
 import hashlib
 import json
 import time
 from datetime import datetime, timezone
 import requests
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 from app.config import Settings
 from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
 from app.parser.collector import collect_profile_links
 from app.parser.profile import parse_person_profile
 from app.parser.profile_url import profile_key
 # Request headers passed to the listing and profile fetches; identifies the crawler by User-Agent.
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
 }
 def run_crawl(db: Session, settings: Settings) -> CrawlRun:
    """Run one full crawl and return the finished ``CrawlRun`` row.

    A run row is created with status ``"running"``, the listing page is read,
    and each profile is fetched, parsed and upserted. ``found_count``,
    ``parsed_count`` and ``error_count`` are committed as the run progresses.
    A failure on one profile is recorded as a ``CrawlError`` and the crawl
    continues; a failure outside the per-profile loop marks the run
    ``"failed"`` with the exception text in ``message``. After a successful
    pass, active employees absent from the source listing are marked
    dismissed. ``finished_at`` is always set.
    """
    source = _ensure_source(db, settings.source_url)
    run = CrawlRun(source_url=source.source_url, status="running")
    db.add(run)
    db.commit()
    db.refresh(run)
    # Kept in a local so error rows can reference the run right after a rollback.
    run_id = run.id
    parsed_count = 0
    try:
        with requests.Session() as session:
            urls = collect_profile_links(session, source.source_url, HEADERS, settings.request_timeout)
            # Dismissal is judged against the complete source listing: keys are
            # gathered before crawl_limit truncates the list, so a limited run
            # does not dismiss every employee beyond the limit.
            found_keys = {key for key in map(profile_key, urls) if key}
            if settings.crawl_limit:
                urls = urls[: settings.crawl_limit]
            run.found_count = len(urls)
            db.commit()
            for url in urls:
                try:
                    parsed = parse_person_profile(
                        session,
                        url,
                        HEADERS,
                        settings.request_timeout,
                        settings.parser_use_playwright,
                    )
                    if not parsed:
                        continue
                    _upsert_employee(db, run, parsed)
                    parsed_count += 1
                    run.parsed_count = parsed_count
                    db.commit()
                except Exception as exc:
                    # Drop whatever the failed profile left pending (a half-built
                    # employee, a bumped new_count) and clear a failed transaction
                    # before recording the error.
                    db.rollback()
                    run.error_count += 1
                    db.add(
                        CrawlError(
                            crawl_run_id=run_id,
                            profile_url=url,
                            error_type=type(exc).__name__,
                            message=str(exc),
                        )
                    )
                    db.commit()
                finally:
                    # Pause between profiles, whether the fetch succeeded or failed.
                    time.sleep(settings.request_delay_seconds)
        run.dismissed_count = _mark_dismissed(db, found_keys)
        run.status = "completed"
    except Exception as exc:
        # Make the session usable again so the failure status can be stored.
        db.rollback()
        run.status = "failed"
        run.message = str(exc)
    finally:
        run.finished_at = datetime.now(timezone.utc)
        db.commit()
        db.refresh(run)
    return run
 def _ensure_source(db: Session, source_url: str) -> ParserSource:
    """Return the ``ParserSource`` row for ``source_url``, creating it if absent.

    A newly created row is enabled, committed and refreshed before it is
    returned.
    """
    lookup = select(ParserSource).where(ParserSource.source_url == source_url)
    record = db.scalar(lookup)
    if not record:
        record = ParserSource(source_url=source_url, enabled=True)
        db.add(record)
        db.commit()
        db.refresh(record)
    return record
 def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
    """Create or update the employee for a parsed profile and store a snapshot.

    The employee is looked up by ``"<profile_type>:<profile_id>"``. A new
    employee increments ``run.new_count``. The employee's stored tabs are
    replaced with the parsed ones and an ``EmployeeSnapshot`` is added for
    this run. Nothing is committed here; only a flush is issued. Note that
    ``parsed`` is mutated: its ``"_html"`` key is removed.
    """
    # Take the raw HTML out first so it is excluded from the checksum and the stored JSON.
    html = parsed.pop("_html", None)
    checksum = _checksum(parsed)
    key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
    employee = db.scalar(select(Employee).where(Employee.profile_key == key))
    now = datetime.now(timezone.utc)
    if not employee:
        employee = Employee(
            profile_key=key,
            profile_type=parsed.get("profile_type"),
            profile_id=parsed.get("profile_id"),
            canonical_url=parsed["source_url"],
            first_seen_at=now,
        )
        db.add(employee)
        run.new_count += 1
    employee.full_name = parsed.get("full_name")
    # Being parsed in this run makes the employee active again and clears any earlier dismissal.
    employee.status = "active"
    employee.last_seen_at = now
    employee.dismissed_at = None
    employee.parser_version = parsed.get("parser_version")
    employee.current_data = parsed
    employee.current_checksum = checksum
    # Flush before employee.id is used below for the tab and snapshot rows.
    db.flush()
    # Replace the stored tabs wholesale with the freshly parsed ones.
    db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
    for tab in parsed.get("tabs") or []:
        db.add(
            ProfileTab(
                employee_id=employee.id,
                title=tab.get("title") or "",
                href=tab.get("href") or "",
                data_index=tab.get("data_index"),
            )
        )
    db.add(
        EmployeeSnapshot(
            employee_id=employee.id,
            crawl_run_id=run.id,
            parsed_data=parsed,
            # The page HTML is stored gzip-compressed when it is available.
            html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
            checksum=checksum,
            parser_version=parsed.get("parser_version"),
        )
    )
    return employee
 def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
    """Mark active employees whose key is not in ``found_keys`` as dismissed.

    All affected rows get the same ``dismissed_at`` timestamp. The change is
    committed and the number of dismissed employees is returned.
    """
    active_employees = db.scalars(select(Employee).where(Employee.status == "active")).all()
    stamp = datetime.now(timezone.utc)
    missing = [person for person in active_employees if person.profile_key not in found_keys]
    for person in missing:
        person.status = "dismissed"
        person.dismissed_at = stamp
    db.commit()
    return len(missing)
 def _checksum(data: dict) -> str:
    payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
--- a/app/static/admin.css
+++ b/app/static/admin.css
@@ -0,0 +1,412 @@
 /* Page shell: body, header bar, navigation, main column, footer and metric grid. */
 .admin {
  min-height: 100vh;
  margin: 0;
  font-family: Arial, sans-serif;
  color: #1f2937;
  background: #f6f7f9;
 }
 .admin__header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  gap: 24px;
  padding: 18px 32px;
  border-bottom: 1px solid #d9dee7;
  background: #fff;
 }
 .admin__brand {
  font-size: 20px;
  margin: 0;
 }
 .admin__nav {
  display: flex;
  gap: 14px;
  align-items: center;
 }
 .admin__link {
  font-weight: 700;
  text-decoration: none;
  color: #0f766e;
 }
 /* Centered content column, capped at 1180px with a 16px gutter on each side. */
 .admin__main {
  margin: 28px auto;
  width: min(1180px, calc(100% - 32px));
 }
 .admin__footer {
  padding: 20px 32px;
  border-top: 1px solid #d9dee7;
  color: #6b7280;
  background: #fff;
 }
 .admin__grid {
  display: grid;
  gap: 16px;
  grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
 }
 /* Metric cards used on the dashboard. */
 .metric {
  padding: 18px;
  border: 1px solid #d9dee7;
  border-radius: 8px;
  background: #fff;
 }
 .metric__label {
  font-size: 13px;
  color: #6b7280;
 }
 .metric__value {
  margin-top: 8px;
  font-weight: 700;
  font-size: 28px;
 }
 /* Generic white card that wraps a page section. */
 .panel {
  margin-top: 22px;
  padding: 20px;
  border: 1px solid #d9dee7;
  border-radius: 8px;
  background: #fff;
 }
 .panel__title {
  font-size: 18px;
  margin: 0 0 16px;
 }
 /* Plain data table shared by the runs, employees and snapshot listings. */
 .table {
  border-collapse: collapse;
  width: 100%;
 }
 .table__head,
 .table__cell {
  padding: 10px 8px;
  text-align: left;
  vertical-align: top;
  border-bottom: 1px solid #e5e7eb;
 }
 /* Status pill; the --dismissed modifier switches it to a red palette. */
 .badge {
  display: inline-block;
  padding: 3px 8px;
  font-size: 12px;
  color: #075985;
  background: #e0f2fe;
  border-radius: 999px;
 }
 .badge--dismissed {
  color: #991b1b;
  background: #fee2e2;
 }
 /* Stacked form layout used by the login and legacy employee search forms. */
 .form {
  display: grid;
  max-width: 380px;
  gap: 12px;
 }
 .form__label {
  display: grid;
  font-weight: 700;
  gap: 6px;
 }
 .form__select,
 .form__input {
  border: 1px solid #cbd5e1;
  border-radius: 6px;
  padding: 10px 12px;
 }
 /* Primary action button; --ghost renders it as a text-only button. */
 .button {
  padding: 10px 14px;
  font-weight: 700;
  color: #fff;
  background: #0f766e;
  border: 0;
  border-radius: 6px;
  cursor: pointer;
 }
 .button--ghost {
  background: transparent;
  color: #0f766e;
 }
 /* Dark block for preformatted output; long lines wrap. */
 .code {
  padding: 14px;
  overflow-x: auto;
  white-space: pre-wrap;
  color: #f9fafb;
  background: #111827;
  border-radius: 8px;
 }
 /* Secondary row of dashboard facts (latest added, runs, errors). */
 .stats-strip {
  display: grid;
  gap: 14px;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  margin-top: 16px;
 }
 .stats-strip__item {
  padding: 14px 16px;
  border: 1px solid #d9dee7;
  border-radius: 8px;
  background: #fff;
 }
 .stats-strip__label {
  display: block;
  font-size: 12px;
  text-transform: uppercase;
  color: #6b7280;
 }
 .stats-strip__value {
  display: block;
  margin-top: 6px;
  font-weight: 700;
  color: #1f2937;
 }
 /* Crawl progress panel: header row, status line, percentage and bar. */
 .progress-panel {
  display: grid;
  gap: 12px;
 }
 .progress-panel__header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  gap: 16px;
 }
 .progress-panel__body {
  display: grid;
  gap: 10px;
 }
 .progress-panel__meta {
  display: flex;
  flex-wrap: wrap;
  gap: 12px;
  font-size: 14px;
  color: #4b5563;
 }
 .progress-panel__percent {
  font-weight: 700;
  color: #0f766e;
 }
 .progress-panel__empty {
  color: #6b7280;
  margin: 0;
 }
 /* Track of the bar; the fill's width is set inline by the template and by admin.js. */
 .progress-bar {
  height: 12px;
  border-radius: 999px;
  background: #e5e7eb;
  overflow: hidden;
 }
 .progress-bar__fill {
  width: 0;
  height: 100%;
  background: #0f766e;
  transition: width 0.25s ease;
 }
 /* Directory page: heading row, filter form, table wrapper and pagination. */
 .directory {
  display: grid;
  gap: 18px;
 }
 .directory__header {
  display: flex;
  /* flex-end instead of end: the bare `end` keyword is ignored by older
     browsers in flex containers, which would leave the items stretched. */
  align-items: flex-end;
  justify-content: space-between;
  gap: 16px;
 }
 .directory__title {
  margin: 0;
  font-size: 24px;
 }
 .directory__summary {
  margin: 6px 0 0;
  color: #6b7280;
 }
 /* One wide search column followed by six narrower filter columns. */
 .directory__filters {
  display: grid;
  grid-template-columns: minmax(220px, 1.7fr) repeat(6, minmax(120px, 1fr));
  gap: 10px;
  padding: 16px;
  background: #ffffff;
  border: 1px solid #d9dee7;
  border-radius: 8px;
 }
 .directory__input {
  /* min-width: 0 lets grid items shrink below their intrinsic width. */
  min-width: 0;
  padding: 10px 12px;
  border: 1px solid #cbd5e1;
  border-radius: 6px;
 }
 /* Horizontal scroll container for the wide directory table. */
 .directory__table-wrap {
  overflow-x: auto;
  background: #ffffff;
  border: 1px solid #d9dee7;
  border-radius: 8px;
 }
 .directory__pagination {
  display: flex;
  align-items: center;
  justify-content: center;
  gap: 16px;
 }
 .directory__page {
  color: #4b5563;
  font-weight: 700;
 }
 /* Directory table; the minimum width forces horizontal scrolling in its wrapper. */
 .directory-table {
  border-collapse: collapse;
  min-width: 1120px;
  width: 100%;
 }
 .directory-table__head {
  padding: 12px 10px;
  font-size: 13px;
  text-align: left;
  white-space: nowrap;
  color: #374151;
  background: #f9fafb;
  border-bottom: 1px solid #e5e7eb;
 }
 .directory-table__cell {
  padding: 12px 10px;
  max-width: 280px;
  vertical-align: top;
  border-bottom: 1px solid #e5e7eb;
 }
 /* Rows act as links to the employee detail page (wired up in admin.js). */
 .directory-table__row {
  cursor: pointer;
 }
 .directory-table__row:hover {
  background: #f0fdfa;
 }
 .directory-table__empty {
  padding: 28px;
  text-align: center;
  color: #6b7280;
 }
 /* Toggled by admin.js to hide columns the user has switched off. */
 .directory-table__head--hidden,
 .directory-table__cell--hidden {
  display: none;
 }
 /* Full-viewport overlay holding the column picker dialog. */
 .columns-modal {
  position: fixed;
  inset: 0;
  z-index: 50;
  display: grid;
  place-items: center;
  padding: 20px;
 }
 /* The display: grid above would otherwise override the hidden attribute. */
 .columns-modal[hidden] {
  display: none;
 }
 .columns-modal__backdrop {
  position: absolute;
  inset: 0;
  background: rgba(17, 24, 39, 0.54);
 }
 .columns-modal__panel {
  position: relative;
  width: min(620px, 100%);
  max-height: min(720px, calc(100vh - 40px));
  padding: 20px;
  overflow: auto;
  border-radius: 8px;
  background: #fff;
  box-shadow: 0 24px 80px rgba(15, 23, 42, 0.22);
 }
 .columns-modal__header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  gap: 12px;
 }
 .columns-modal__title {
  font-size: 18px;
  margin: 0;
 }
 .columns-modal__grid {
  display: grid;
  gap: 10px;
  grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
  margin-top: 18px;
 }
 .columns-modal__option {
  display: flex;
  gap: 8px;
  align-items: center;
  padding: 10px;
  border: 1px solid #e5e7eb;
  border-radius: 6px;
  background: #f9fafb;
 }
 .columns-modal__checkbox {
  height: 16px;
  width: 16px;
 }
/* Up to 920px: two filter columns, and header rows stack vertically. */
@media (max-width: 920px) {
  .directory__filters {
    grid-template-columns: 1fr 1fr;
  }
  .directory__header,
  .progress-panel__header {
    flex-direction: column;
    align-items: stretch;
  }
 }
/* Up to 620px: filters collapse to a single column. */
@media (max-width: 620px) {
  .directory__filters {
    grid-template-columns: 1fr;
  }
 }
--- a/app/static/admin.js
+++ b/app/static/admin.js
@@ -0,0 +1,111 @@
 (function () {
  // Columns used by readColumns() when no usable selection is stored.
  // Values match the data-column attributes of the directory table.
  const columnDefaults = [
    "full_name",
    "status",
    "positions",
    "hse_start_year",
    "email",
    "last_seen_at",
    "dismissed_at",
    "profile",
  ];
  // localStorage key under which the visible-column selection is saved.
  const storageKey = "miem.directory.columns";
  /**
   * Load the saved column selection from localStorage.
   *
   * Falls back to columnDefaults when nothing usable is stored: the key is
   * missing, storage access or JSON parsing throws, or the stored value is
   * not a non-empty array.
   */
  function readColumns() {
    let saved;
    try {
      saved = JSON.parse(localStorage.getItem(storageKey) || "[]");
    } catch (_error) {
      return columnDefaults;
    }
    if (!Array.isArray(saved) || saved.length === 0) return columnDefaults;
    return saved;
  }
  /**
   * Persist the visible-column selection to localStorage.
   *
   * Storage failures (quota exceeded, storage disabled or blocked by the
   * browser) are ignored on purpose: the caller still applies the selection
   * to the current page, it just will not survive a reload. This mirrors the
   * guarded read in readColumns().
   */
  function writeColumns(columns) {
    try {
      localStorage.setItem(storageKey, JSON.stringify(columns));
    } catch (_error) {
      // Best effort only; see the function comment.
    }
  }
  /**
   * Show or hide table columns and sync the picker checkboxes.
   *
   * Every [data-column] node whose column is not in `columns` receives the
   * hidden modifier that matches its own base class (cell or head); the
   * modifier is removed otherwise. Each [data-column-toggle] checkbox is
   * checked exactly when its value is in `columns`.
   */
  function applyColumns(columns) {
    const isSelected = (name) => columns.includes(name);
    for (const node of document.querySelectorAll("[data-column]")) {
      const hide = !isSelected(node.dataset.column);
      const classes = node.classList;
      classes.toggle("directory-table__cell--hidden", hide && classes.contains("directory-table__cell"));
      classes.toggle("directory-table__head--hidden", hide && classes.contains("directory-table__head"));
    }
    for (const box of document.querySelectorAll("[data-column-toggle]")) {
      box.checked = isSelected(box.value);
    }
  }
  /**
   * Wire up the directory page: column picker modal, column toggles and
   * clickable rows. Does nothing on pages without a [data-directory-table].
   */
  function setupColumns() {
    const table = document.querySelector("[data-directory-table]");
    if (!table) return;
    // Attach one handler to every element matching the selector.
    const on = (selector, type, handler) => {
      document.querySelectorAll(selector).forEach((element) => {
        element.addEventListener(type, handler);
      });
    };
    let columns = readColumns();
    const modal = document.querySelector("[data-columns-modal]");
    applyColumns(columns);
    on("[data-columns-open]", "click", () => {
      modal.hidden = false;
    });
    on("[data-columns-close]", "click", () => {
      modal.hidden = true;
    });
    on("[data-column-toggle]", "change", () => {
      const checked = document.querySelectorAll("[data-column-toggle]:checked");
      columns = Array.from(checked, (item) => item.value);
      // Never allow an empty table: keep at least the name column.
      if (columns.length === 0) columns = ["full_name"];
      writeColumns(columns);
      applyColumns(columns);
    });
    on("[data-row-href]", "click", (event) => {
      // Clicks on interactive descendants keep their own behaviour.
      const interactive = event.target.closest("a, button, input, select, label");
      if (interactive) return;
      window.location.href = event.currentTarget.dataset.rowHref;
    });
  }
  /**
   * Keep the crawl progress panel in sync with /api/crawl-runs/latest.
   *
   * Does nothing on pages without a [data-progress-panel]. Otherwise the
   * endpoint is polled every 4 seconds, starting 4 seconds after load, and
   * polling stops once the API reports no running crawl.
   *
   * Differences from a plain setInterval loop:
   * - the next poll is scheduled only after the previous one has finished,
   *   so slow responses never overlap;
   * - a failed request (network error or non-2xx status) does not end
   *   polling immediately; it stops after several consecutive failures.
   */
  function setupProgress() {
    const panel = document.querySelector("[data-progress-panel]");
    if (!panel) return;
    const pollDelayMs = 4000;
    const maxConsecutiveFailures = 5;
    let failures = 0;
    // Write a value into the first node matching the selector, skipping
    // missing nodes and missing values so the UI never shows "undefined".
    const setText = (selector, value) => {
      const node = document.querySelector(selector);
      if (node && value != null) node.textContent = value;
    };
    const update = (run) => {
      if (!run) return;
      // Prefer the derived fields from the API; if they are absent, fall back
      // to (parsed_count + error_count) / found_count.
      let processed = run.processed_count;
      if (processed == null) {
        processed = (run.parsed_count || 0) + (run.error_count || 0);
      }
      let percent = run.progress_percent;
      if (percent == null) {
        percent = run.found_count ? Math.round((processed / run.found_count) * 1000) / 10 : 0;
      }
      setText("[data-progress-status]", run.status);
      setText("[data-progress-processed]", processed);
      setText("[data-progress-found]", run.found_count);
      setText("[data-progress-errors]", run.error_count);
      setText("[data-progress-percent]", percent);
      const fill = document.querySelector("[data-progress-fill]");
      if (fill) fill.style.width = `${percent}%`;
    };
    // Resolves to true when polling should continue.
    const poll = async () => {
      try {
        const response = await fetch("/api/crawl-runs/latest", { credentials: "same-origin" });
        if (!response.ok) throw new Error(`HTTP ${response.status}`);
        const data = await response.json();
        failures = 0;
        update(data.running || data.latest);
        return Boolean(data.running);
      } catch (_error) {
        failures += 1;
        return failures < maxConsecutiveFailures;
      }
    };
    const schedule = () => {
      window.setTimeout(async () => {
        if (await poll()) schedule();
      }, pollDelayMs);
    };
    schedule();
  }
  setupColumns();
  setupProgress();
 })();
--- a/app/templates/base.html
+++ b/app/templates/base.html
@@ -0,0 +1,30 @@
 <!doctype html>
 <html lang="ru">
  {# Base layout for admin pages. Child templates fill the title, content and scripts blocks. #}
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{% block title %}MIEM Employees{% endblock %}</title>
    <link rel="stylesheet" href="/static/admin.css">
  </head>
  <body class="admin">
    <header class="admin__header">
      <h1 class="admin__brand">MIEM Employees</h1>
      <nav class="admin__nav">
        <a class="admin__link" href="/admin">Dashboard</a>
        <a class="admin__link" href="/admin/directory">Directory</a>
        <a class="admin__link" href="/admin/employees">Employees</a>
        <a class="admin__link" href="/admin/runs">Runs</a>
        {# Logout is a POST form rather than a link. #}
        <form method="post" action="/admin/logout">
          <button class="button button--ghost" type="submit">Logout</button>
        </form>
      </nav>
    </header>
    <main class="admin__main">
      {% block content %}{% endblock %}
    </main>
    {# Expects backend_version and frontend_version in the template context. #}
    <footer class="admin__footer">
      Backend {{ backend_version }} · Frontend {{ frontend_version }}
    </footer>
    {% block scripts %}{% endblock %}
  </body>
 </html>
--- a/app/templates/dashboard.html
+++ b/app/templates/dashboard.html
@@ -0,0 +1,62 @@
 {% extends "base.html" %}
 {# Dashboard: employee counters, latest-added strip, crawl progress panel and recent runs. #}
 {# Context used here: counts, latest_run, runs. #}
 {% block title %}Dashboard · MIEM Employees{% endblock %}
 {% block content %}
 <section class="admin__grid">
  <div class="metric"><div class="metric__label">Total</div><div class="metric__value">{{ counts.total }}</div></div>
  <div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div>
  <div class="metric"><div class="metric__label">New in last run</div><div class="metric__value">{{ counts.new_in_last_run }}</div></div>
  <div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div>
 </section>
 <section class="stats-strip">
  <div class="stats-strip__item">
    <span class="stats-strip__label">Latest added</span>
    {% if counts.latest_added %}
    <a class="stats-strip__value" href="/admin/employees/{{ counts.latest_added.id }}">{{ counts.latest_added.full_name or counts.latest_added.canonical_url }}</a>
    {% else %}
    <span class="stats-strip__value">No employees yet</span>
    {% endif %}
  </div>
  <div class="stats-strip__item">
    <span class="stats-strip__label">Runs</span>
    <span class="stats-strip__value">{{ counts.runs }}</span>
  </div>
  <div class="stats-strip__item">
    <span class="stats-strip__label">Errors</span>
    <span class="stats-strip__value">{{ counts.errors }}</span>
  </div>
 </section>
 {# The data-progress-* attributes below are the hooks admin.js updates while polling. #}
 <section class="panel progress-panel" data-progress-panel>
  <div class="progress-panel__header">
    <h2 class="panel__title">Parsing progress</h2>
    <form method="post" action="/admin/crawl-now">
      <button class="button" type="submit">Start crawl now</button>
    </form>
  </div>
  {# Show the running crawl when there is one, otherwise the latest run; "idle" and zeros when neither exists. #}
  {% set run = counts.current_running_run or latest_run %}
  <div class="progress-panel__body" data-progress-body>
    <div class="progress-panel__meta">
      <span data-progress-status>{{ run.status if run else "idle" }}</span>
      <span><span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span> processed</span>
      <span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
    </div>
    <div class="progress-bar" aria-label="Parsing progress">
      <div class="progress-bar__fill" data-progress-fill style="width: {{ run.progress_percent if run else 0 }}%"></div>
    </div>
    <div class="progress-panel__percent"><span data-progress-percent>{{ run.progress_percent if run else 0 }}</span>%</div>
  </div>
 </section>
 <section class="panel">
  <h2 class="panel__title">Latest runs</h2>
  <table class="table">
    <thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Started</th></tr></thead>
    <tbody>
      {% for run in runs %}
      <tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_at }}</td></tr>
      {% endfor %}
    </tbody>
  </table>
 </section>
 {% endblock %}
 {% block scripts %}
 <script src="/static/admin.js"></script>
 {% endblock %}
--- a/app/templates/directory.html
+++ b/app/templates/directory.html
@@ -0,0 +1,112 @@
 {% extends "base.html" %}
 {# Directory: filterable, sortable, paginated employee table with a column picker. #}
 {# Context used here: page (items, total, page, pages), filters, request. #}
 {% block title %}Directory · MIEM Employees{% endblock %}
 {% block content %}
 <section class="directory">
  <div class="directory__header">
    <div>
      <h2 class="directory__title">Directory</h2>
      <p class="directory__summary">{{ page.total }} employees found</p>
    </div>
    <button class="button" type="button" data-columns-open>Columns</button>
  </div>
  {# Filters are submitted as GET parameters so the current view can be bookmarked. #}
  <form class="directory__filters" method="get" action="/admin/directory">
    <input class="directory__input" name="q" value="{{ filters.q }}" placeholder="Name or URL">
    <select class="directory__input" name="status">
      <option value="" {% if not filters.status %}selected{% endif %}>All statuses</option>
      <option value="active" {% if filters.status == "active" %}selected{% endif %}>Active</option>
      <option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Dismissed</option>
    </select>
    <select class="directory__input" name="has_email">
      <option value="" {% if not filters.has_email %}selected{% endif %}>Any email</option>
      <option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Has email</option>
      <option value="false" {% if filters.has_email == "false" %}selected{% endif %}>No email</option>
    </select>
    <input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="First seen from">
    <input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="First seen to">
    <select class="directory__input" name="sort">
      {% for value, label in [("full_name", "Name"), ("status", "Status"), ("hse_start_year", "HSE start"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed")] %}
      <option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Sort: {{ label }}</option>
      {% endfor %}
    </select>
    <select class="directory__input" name="direction">
      <option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>Ascending</option>
      <option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>Descending</option>
    </select>
    <button class="button" type="submit">Apply</button>
  </form>
  {# data-column values must match the keys of the column picker below and in admin.js. #}
  <div class="directory__table-wrap">
    <table class="directory-table" data-directory-table>
      <thead>
        <tr>
          <th class="directory-table__head" data-column="full_name">Name</th>
          <th class="directory-table__head" data-column="status">Status</th>
          <th class="directory-table__head" data-column="positions">Positions</th>
          <th class="directory-table__head" data-column="hse_start_year">HSE start</th>
          <th class="directory-table__head" data-column="email">Email</th>
          <th class="directory-table__head" data-column="phone">Phone</th>
          <th class="directory-table__head" data-column="address">Address</th>
          <th class="directory-table__head" data-column="publications_count">Publications</th>
          <th class="directory-table__head" data-column="courses_count">Courses</th>
          <th class="directory-table__head" data-column="first_seen_at">First seen</th>
          <th class="directory-table__head" data-column="last_seen_at">Last seen</th>
          <th class="directory-table__head" data-column="dismissed_at">Dismissed</th>
          <th class="directory-table__head" data-column="profile">Profile</th>
        </tr>
      </thead>
      <tbody>
        {% for employee in page.items %}
        {# admin.js navigates to data-row-href when the row itself is clicked. #}
        <tr class="directory-table__row" data-row-href="/admin/employees/{{ employee.id }}">
          <td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "No name" }}</td>
          <td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
          <td class="directory-table__cell" data-column="positions">{{ employee.positions_text }}</td>
          <td class="directory-table__cell" data-column="hse_start_year">{{ employee.hse_start_year or "" }}</td>
          <td class="directory-table__cell" data-column="email">{{ employee.email_text }}</td>
          <td class="directory-table__cell" data-column="phone">{{ employee.phone_text }}</td>
          <td class="directory-table__cell" data-column="address">{{ employee.address or "" }}</td>
          <td class="directory-table__cell" data-column="publications_count">{{ employee.publications_count }}</td>
          <td class="directory-table__cell" data-column="courses_count">{{ employee.courses_count }}</td>
          <td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_at or "" }}</td>
          <td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_at or "" }}</td>
          <td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_at or "" }}</td>
          <td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Open</a></td>
        </tr>
        {% else %}
        <tr><td class="directory-table__empty" colspan="13">No employees match these filters.</td></tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
  <div class="directory__pagination">
    {# Clamp at 0: with an offset smaller than one page the previous link must not go negative. #}
    {% set prev_offset = [filters.offset - filters.limit, 0] | max %}
    {% set next_offset = filters.offset + filters.limit %}
    {% if filters.offset > 0 %}
    <a class="admin__link" href="{{ request.url.include_query_params(offset=prev_offset) }}">Previous</a>
    {% endif %}
    <span class="directory__page">Page {{ page.page }}{% if page.pages %} of {{ page.pages }}{% endif %}</span>
    {% if next_offset < page.total %}
    <a class="admin__link" href="{{ request.url.include_query_params(offset=next_offset) }}">Next</a>
    {% endif %}
  </div>
 </section>
 {# Column picker; the checkbox state is set by admin.js from the saved selection. #}
 <div class="columns-modal" data-columns-modal hidden>
  <div class="columns-modal__backdrop" data-columns-close></div>
  <section class="columns-modal__panel" aria-label="Column settings">
    <div class="columns-modal__header">
      <h3 class="columns-modal__title">Visible columns</h3>
      <button class="button button--ghost" type="button" data-columns-close>Close</button>
    </div>
    <div class="columns-modal__grid">
      {% for key, label in [("full_name", "Name"), ("status", "Status"), ("positions", "Positions"), ("hse_start_year", "HSE start"), ("email", "Email"), ("phone", "Phone"), ("address", "Address"), ("publications_count", "Publications"), ("courses_count", "Courses"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed"), ("profile", "Profile")] %}
      <label class="columns-modal__option"><input class="columns-modal__checkbox" type="checkbox" value="{{ key }}" data-column-toggle> {{ label }}</label>
      {% endfor %}
    </div>
  </section>
 </div>
 {% endblock %}
 {% block scripts %}
 <script src="/static/admin.js"></script>
 {% endblock %}
--- a/app/templates/employee_detail.html
+++ b/app/templates/employee_detail.html
@@ -0,0 +1,28 @@
 {% extends "base.html" %}
 {# Employee detail: status, profile link, tabs, current parsed data and snapshot history. #}
 {# Context used here: employee, snapshots. #}
 {# The title uses the same profile_key fallback as the heading, so a missing name never renders as "None". #}
 {% block title %}{{ employee.full_name or employee.profile_key }} · MIEM Employees{% endblock %}
 {% block content %}
 <section class="panel">
  <h2 class="panel__title">{{ employee.full_name or employee.profile_key }}</h2>
  <p><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></p>
  <p><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></p>
  <h3>Tabs</h3>
  <ul>
    {% for tab in employee.tabs %}
    <li><a class="admin__link" href="{{ tab.href }}">{{ tab.title }}</a></li>
    {% endfor %}
  </ul>
  <h3>Current data</h3>
  <pre class="code">{{ employee.current_data | tojson(indent=2) }}</pre>
 </section>
 <section class="panel">
  <h2 class="panel__title">Snapshots</h2>
  <table class="table">
    <thead><tr><th class="table__head">Captured</th><th class="table__head">Checksum</th><th class="table__head">Parser</th></tr></thead>
    <tbody>
      {% for snapshot in snapshots %}
      <tr><td class="table__cell">{{ snapshot.captured_at }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
      {% endfor %}
    </tbody>
  </table>
 </section>
 {% endblock %}
--- a/app/templates/employees.html
+++ b/app/templates/employees.html
@@ -0,0 +1,29 @@
 {% extends "base.html" %}
 {# Simple employee list with a name/URL search box and a status filter. #}
 {# Context used here: employees, q, status. #}
 {% block title %}Employees · MIEM Employees{% endblock %}
 {% block content %}
 <section class="panel">
  <h2 class="panel__title">Employees</h2>
  <form class="form" method="get" action="/admin/employees">
    <input class="form__input" name="q" value="{{ q }}" placeholder="Name or URL">
    <select class="form__select" name="status">
      <option value="" {% if not status %}selected{% endif %}>All</option>
      <option value="active" {% if status == "active" %}selected{% endif %}>Active</option>
      <option value="dismissed" {% if status == "dismissed" %}selected{% endif %}>Dismissed</option>
    </select>
    <button class="button" type="submit">Search</button>
  </form>
  <table class="table">
    <thead><tr><th class="table__head">Name</th><th class="table__head">Status</th><th class="table__head">Last seen</th><th class="table__head">Profile</th></tr></thead>
    <tbody>
      {% for employee in employees %}
      <tr>
        {# The name links to the admin detail page; the last column links to the source profile. #}
        <td class="table__cell"><a class="admin__link" href="/admin/employees/{{ employee.id }}">{{ employee.full_name or employee.profile_key }}</a></td>
        <td class="table__cell"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
        <td class="table__cell">{{ employee.last_seen_at }}</td>
        <td class="table__cell"><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
 </section>
 {% endblock %}
--- a/app/templates/login.html
+++ b/app/templates/login.html
@@ -0,0 +1,25 @@
 <!doctype html>
 <html lang="ru">
  {# Standalone login page: it does not extend base.html, so no admin navigation is rendered. #}
  {# Context used here: error (optional), backend_version, frontend_version. #}
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Login · MIEM Employees</title>
    <link rel="stylesheet" href="/static/admin.css">
  </head>
  <body class="admin">
    <main class="admin__main">
      <section class="panel">
        <h1 class="panel__title">Admin login</h1>
        {% if error %}<p>{{ error }}</p>{% endif %}
        {# Credentials are posted as the form fields username and password. #}
        <form class="form" method="post" action="/admin/login">
          <label class="form__label">Login <input class="form__input" name="username" autocomplete="username"></label>
          <label class="form__label">Password <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
          <button class="button" type="submit">Sign in</button>
        </form>
      </section>
    </main>
    <footer class="admin__footer">
      Backend {{ backend_version }} · Frontend {{ frontend_version }}
    </footer>
  </body>
 </html>
--- a/app/templates/runs.html
+++ b/app/templates/runs.html
@@ -0,0 +1,60 @@
 {% extends "base.html" %}
 {# Runs: progress of the most recent crawl run, run history and recent crawl errors. #}
 {# Context used here: runs (first item is treated as the latest run), errors. #}
 {% block title %}Runs · MIEM Employees{% endblock %}
 {% block content %}
 <section class="panel">
  <div class="progress-panel__header">
    <h2 class="panel__title">Crawl runs</h2>
    <form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl now</button></form>
  </div>
  {# One panel serves both states: values come from the first run, or "idle" and zeros when there are no runs. #}
  {% set run = runs[0] if runs else none %}
  {% set processed = (run.parsed_count + run.error_count) if run else 0 %}
  {% set found = run.found_count if run else 0 %}
  {# Progress is (parsed + errors) / found, rounded to one decimal; 0 when nothing has been found. #}
  {% set percent = ((processed / found) * 100) | round(1) if found else 0 %}
  {# The data-progress-* attributes are the hooks admin.js updates while polling. #}
  <div class="progress-panel" data-progress-panel>
    <div class="progress-panel__meta">
      <span data-progress-status>{{ run.status if run else "idle" }}</span>
      <span><span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ found }}</span> processed</span>
      <span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
    </div>
    <div class="progress-bar" aria-label="Parsing progress">
      <div class="progress-bar__fill" data-progress-fill style="width: {{ percent }}%"></div>
    </div>
    <div class="progress-panel__percent"><span data-progress-percent>{{ percent }}</span>%</div>
  </div>
  <table class="table">
    <thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">New</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead>
    <tbody>
      {% for run in runs %}
      <tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr>
      {% endfor %}
    </tbody>
  </table>
 </section>
 <section class="panel">
  <h2 class="panel__title">Recent errors</h2>
  <table class="table">
    <thead><tr><th class="table__head">Run</th><th class="table__head">Profile</th><th class="table__head">Error</th></tr></thead>
    <tbody>
      {% for error in errors %}
      <tr><td class="table__cell">{{ error.crawl_run_id }}</td><td class="table__cell">{{ error.profile_url }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td></tr>
      {% endfor %}
    </tbody>
  </table>
 </section>
 {% endblock %}
 {% block scripts %}
 <script src="/static/admin.js"></script>
 {% endblock %}
--- a/app/version.py
+++ b/app/version.py
@@ -0,0 +1,3 @@
 # Version strings for the application as a whole and for its two halves.
 # NOTE(review): FRONTEND_VERSION and BACKEND_VERSION are presumably the values
 # rendered in the admin footer; confirm against the template context setup.
 APP_VERSION = "0.2.0"
 FRONTEND_VERSION = "0.2.0"
 BACKEND_VERSION = "0.2.0"
--- a/app/worker.py
+++ b/app/worker.py
@@ -0,0 +1,45 @@
 import logging
 import signal
 import time
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
 from app.config import get_settings
 from app.db import SessionLocal, init_db
 from app.services.crawler import run_crawl
 # Configure root logging at import time: INFO level, timestamped records.
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 # Module-level logger used by crawl_once() and main().
 logger = logging.getLogger(__name__)
 def crawl_once() -> None:
    """Run one crawl in a fresh database session and log its summary.

    The session is closed when the ``with`` block exits, whether or not
    ``run_crawl`` raises.
    """
    config = get_settings()
    with SessionLocal() as session:
        result = run_crawl(session, config)
        logger.info(
            "crawl finished: id=%s status=%s parsed=%s errors=%s",
            result.id,
            result.status,
            result.parsed_count,
            result.error_count,
        )
 def main() -> None:
    """Run the crawl scheduler until SIGTERM or SIGINT is received.

    Initialises the database, schedules ``crawl_once`` according to the
    ``crawl_cron`` setting, then idles until a termination signal arrives
    and shuts the scheduler down.
    """
    init_db()
    settings = get_settings()
    schedule_timezone = "Europe/Moscow"
    scheduler = BackgroundScheduler(timezone=schedule_timezone)
    # A CronTrigger built directly does not inherit the scheduler's timezone
    # (it falls back to the host's local zone), so pass it explicitly to keep
    # the cron expression on the same clock as the scheduler.
    trigger = CronTrigger.from_crontab(settings.crawl_cron, timezone=schedule_timezone)
    scheduler.add_job(crawl_once, trigger, id="weekly_miem_crawl", replace_existing=True)
    scheduler.start()
    logger.info("worker started with cron=%s", settings.crawl_cron)
    stop = False
    def _stop(*_: object) -> None:
        # Signal handler: only flips the flag; the loop below does the exit.
        nonlocal stop
        stop = True
    signal.signal(signal.SIGTERM, _stop)
    signal.signal(signal.SIGINT, _stop)
    # The scheduler runs in a background thread; keep the main thread alive
    # and check the stop flag once per second.
    while not stop:
        time.sleep(1)
    scheduler.shutdown()
 # Start the worker only when this module is executed as a script.
 if __name__ == "__main__":
    main()
--- a/migrations/001_init.sql
+++ b/migrations/001_init.sql
@@ -0,0 +1,75 @@
 -- Registry of source URLs: each URL is unique and carries an enabled flag.
 -- NOTE(review): presumably the listing pages the crawler starts from - confirm.
 CREATE TABLE IF NOT EXISTS parser_sources (
  id SERIAL PRIMARY KEY,
  source_url TEXT NOT NULL UNIQUE,
  enabled BOOLEAN NOT NULL DEFAULT TRUE,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
 );
 -- One row per crawl run: source URL, status (defaults to 'running'),
 -- start/finish timestamps, per-run counters and an optional message.
 -- All counters default to 0; new_count holds the number of employees
 -- first added during the run.
 CREATE TABLE IF NOT EXISTS crawl_runs (
  id SERIAL PRIMARY KEY,
  source_url TEXT NOT NULL,
  status VARCHAR(32) NOT NULL DEFAULT 'running',
  started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  finished_at TIMESTAMPTZ,
  found_count INTEGER NOT NULL DEFAULT 0,
  parsed_count INTEGER NOT NULL DEFAULT 0,
  new_count INTEGER NOT NULL DEFAULT 0,
  error_count INTEGER NOT NULL DEFAULT 0,
  dismissed_count INTEGER NOT NULL DEFAULT 0,
  message TEXT
 );
 -- Current state of each known employee profile, identified by the unique
 -- profile_key. status defaults to 'active'; dismissed_at stays NULL until set.
 -- current_data stores the latest parsed profile as JSONB together with its
 -- checksum and the parser version that produced it.
 CREATE TABLE IF NOT EXISTS employees (
  id SERIAL PRIMARY KEY,
  profile_key VARCHAR(255) NOT NULL UNIQUE,
  profile_type VARCHAR(50),
  profile_id VARCHAR(255),
  canonical_url TEXT NOT NULL,
  full_name TEXT,
  status VARCHAR(32) NOT NULL DEFAULT 'active',
  first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  last_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  dismissed_at TIMESTAMPTZ,
  parser_version VARCHAR(32),
  current_data JSONB,
  current_checksum VARCHAR(64),
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
 );
 -- Indexes on the name and status columns of employees.
 CREATE INDEX IF NOT EXISTS ix_employees_full_name ON employees (full_name);
 CREATE INDEX IF NOT EXISTS ix_employees_status ON employees (status);
 -- Captured profile snapshots: parsed data (JSONB), optional raw HTML bytes,
 -- checksum and parser version. Each row belongs to an employee and may
 -- reference the crawl run that produced it (crawl_run_id is nullable).
 CREATE TABLE IF NOT EXISTS employee_snapshots (
  id SERIAL PRIMARY KEY,
  employee_id INTEGER NOT NULL REFERENCES employees(id),
  crawl_run_id INTEGER REFERENCES crawl_runs(id),
  parsed_data JSONB NOT NULL,
  html_snapshot BYTEA,
  checksum VARCHAR(64) NOT NULL,
  parser_version VARCHAR(32),
  captured_at TIMESTAMPTZ NOT NULL DEFAULT now()
 );
 -- Index on the employee foreign key of employee_snapshots.
 CREATE INDEX IF NOT EXISTS ix_employee_snapshots_employee_id ON employee_snapshots (employee_id);
 -- Errors recorded for a crawl run: the profile URL involved (nullable),
 -- an error type and a message. Every row must reference a crawl run.
 CREATE TABLE IF NOT EXISTS crawl_errors (
  id SERIAL PRIMARY KEY,
  crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id),
  profile_url TEXT,
  error_type VARCHAR(255) NOT NULL,
  message TEXT NOT NULL,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
 );
 -- Index on the crawl-run foreign key of crawl_errors.
 CREATE INDEX IF NOT EXISTS ix_crawl_errors_run_id ON crawl_errors (crawl_run_id);
 -- Navigation tabs belonging to an employee profile: title, link target and an
 -- optional data_index value.
 CREATE TABLE IF NOT EXISTS profile_tabs (
  id SERIAL PRIMARY KEY,
  employee_id INTEGER NOT NULL REFERENCES employees(id),
  title TEXT NOT NULL,
  href TEXT NOT NULL,
  data_index VARCHAR(64)
 );
 -- Index on the employee foreign key of profile_tabs.
 CREATE INDEX IF NOT EXISTS ix_profile_tabs_employee_id ON profile_tabs (employee_id);
--- a/migrations/002_crawl_run_new_count.sql
+++ b/migrations/002_crawl_run_new_count.sql
@@ -0,0 +1,2 @@
 -- Adds crawl_runs.new_count (NOT NULL, default 0) when it is missing.
 -- 001_init.sql already declares the column for fresh databases, so
 -- IF NOT EXISTS turns this into a no-op there.
 -- NOTE(review): presumably targets databases created before the column was
 -- added to 001_init.sql - confirm.
 ALTER TABLE crawl_runs
 ADD COLUMN IF NOT EXISTS new_count INTEGER NOT NULL DEFAULT 0;
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,23 @@
 import pytest
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.pool import StaticPool
 from app.db import Base
@pytest.fixture()
 def db_session():
    """Yield a SQLAlchemy session bound to a fresh in-memory SQLite database.

    The schema from ``Base.metadata`` is created before the test and dropped
    afterwards. ``StaticPool`` keeps a single connection so that every session
    sees the same in-memory database.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        yield session
    finally:
        session.close()
        Base.metadata.drop_all(engine)
        # Close the pooled connection explicitly instead of leaving one open
        # engine per test until garbage collection.
        engine.dispose()
--- a/tests/test_admin_data.py
+++ b/tests/test_admin_data.py
@@ -0,0 +1,98 @@
 from datetime import datetime, timezone
 from app.models import CrawlRun, Employee
 from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
 def test_employee_display_payload_extracts_common_fields(db_session):
    """The display payload exposes positions, e-mail and section counts from current_data."""
    profile_data = {
        "positions": ["Professor"],
        "hse_start_year": 2024,
        "contacts": {"emails": ["person@hse.ru"], "phones": ["+79990000000"], "address": "Moscow"},
        "sections": [
            {"type": "publications", "publications": [{"title": "Paper"}]},
            {"type": "courses_by_year", "courses": [{"title": "Course"}]},
        ],
    }
    person = Employee(
        profile_key="staff:person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data=profile_data,
    )

    display = employee_display_payload(person)

    assert display["positions_text"] == "Professor"
    assert display["email_text"] == "person@hse.ru"
    assert display["publications_count"] == 1
    assert display["courses_count"] == 1
 def test_list_employees_page_filters_sorts_and_paginates(db_session):
    """Filtering by status="active" returns only the active employee."""
    dismissed_employee = Employee(
        profile_key="staff:b",
        canonical_url="https://www.hse.ru/staff/b",
        full_name="Beta",
        status="dismissed",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": []}},
    )
    active_employee = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": ["alpha@hse.ru"]}},
    )
    db_session.add_all([dismissed_employee, active_employee])
    db_session.commit()

    result = list_employees_page(db_session, status="active", sort="full_name", direction="asc", limit=10)

    assert result["total"] == 1
    assert result["items"][0]["full_name"] == "Alpha"
 def test_stats_payload_uses_latest_run_new_count(db_session):
    """stats_payload reports employee totals and the new_count of the stored run."""
    employee = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    completed_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=3)
    db_session.add_all([employee, completed_run])
    db_session.commit()

    stats = stats_payload(db_session)

    assert stats["total"] == 1
    assert stats["active"] == 1
    assert stats["new_in_last_run"] == 3
 def test_run_payload_calculates_progress():
    """Progress counts parsed plus failed profiles against the number found."""
    counters = {"found_count": 10, "parsed_count": 4, "error_count": 1}
    crawl_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running", **counters)

    result = run_payload(crawl_run)

    assert result["processed_count"] == 5
    assert result["progress_percent"] == 50.0
--- a/tests/test_api_mcp.py
+++ b/tests/test_api_mcp.py
@@ -0,0 +1,159 @@
 from datetime import datetime, timezone
 from fastapi.testclient import TestClient
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.pool import StaticPool
 from app.config import Settings, get_settings
 from app.db import Base, get_db
 from app.main import app
 from app.models import CrawlRun, Employee
 from app.security import SESSION_COOKIE, sign_session
 def test_health_returns_versions():
    """GET /api/health answers 200 and reports the backend version."""
    health = TestClient(app).get("/api/health")
    assert health.status_code == 200
    body = health.json()
    assert body["backend_version"] == "0.2.0"
 def test_mcp_requires_token_and_lists_tools():
    """POST /mcp answers 401 without a bearer token and lists tools with a valid one."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    def override_db():
        session = Session()
        try:
            yield session
        finally:
            session.close()
    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
    # Overrides live on the shared ``app`` object; try/finally guarantees a
    # failing assertion cannot leak them into later tests.
    try:
        client = TestClient(app)
        unauthorized = client.post("/mcp", json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
        authorized = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
        )
        assert unauthorized.status_code == 401
        assert authorized.status_code == 200
        assert authorized.json()["result"]["tools"][0]["name"] == "search_employees"
    finally:
        app.dependency_overrides.clear()
 def test_mcp_search_employees_returns_matching_employee():
    """The search_employees MCP tool returns the stored employee matching the query."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(
        Employee(
            profile_key="staff:avsergeev",
            profile_type="staff",
            profile_id="avsergeev",
            canonical_url="https://www.hse.ru/staff/avsergeev",
            full_name="Сергеев Алексей Викторович",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"sections": []},
        )
    )
    session.commit()
    session.close()
    def override_db():
        db = Session()
        try:
            yield db
        finally:
            db.close()
    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
    # Overrides live on the shared ``app`` object; try/finally guarantees a
    # failing assertion cannot leak them into later tests.
    try:
        client = TestClient(app)
        response = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "search_employees", "arguments": {"query": "Сергеев"}},
            },
        )
        assert response.status_code == 200
        assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"]
    finally:
        app.dependency_overrides.clear()
 def test_api_employees_and_stats_require_admin_session():
    """With a signed admin session cookie, /api/employees and /api/stats return data."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    db = Session()
    db.add(
        Employee(
            profile_key="staff:alpha",
            profile_type="staff",
            profile_id="alpha",
            canonical_url="https://www.hse.ru/staff/alpha",
            full_name="Alpha Person",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []},
        )
    )
    db.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1))
    db.commit()
    db.close()
    settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")
    def override_db():
        session = Session()
        try:
            yield session
        finally:
            session.close()
    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: settings
    # Overrides live on the shared ``app`` object; try/finally guarantees a
    # failing assertion cannot leak them into later tests.
    try:
        client = TestClient(app)
        client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))
        employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True})
        stats = client.get("/api/stats")
        assert employees.status_code == 200
        assert employees.json()["total"] == 1
        assert stats.status_code == 200
        assert stats.json()["new_in_last_run"] == 1
    finally:
        app.dependency_overrides.clear()
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -0,0 +1,58 @@
 from datetime import datetime, timezone
 from app.models import CrawlRun, Employee
 from app.services.crawler import _mark_dismissed, _upsert_employee
 def test_mark_dismissed_only_marks_missing_active_employees(db_session):
    """Only the active employee absent from the seen keys becomes dismissed."""
    still_listed = Employee(
        profile_key="staff:kept",
        canonical_url="https://www.hse.ru/staff/kept",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    no_longer_listed = Employee(
        profile_key="staff:gone",
        canonical_url="https://www.hse.ru/staff/gone",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add_all([still_listed, no_longer_listed])
    db_session.commit()

    dismissed_total = _mark_dismissed(db_session, {"staff:kept"})

    assert dismissed_total == 1
    kept_row = db_session.query(Employee).filter_by(profile_key="staff:kept").one()
    assert kept_row.status == "active"
    gone_row = db_session.query(Employee).filter_by(profile_key="staff:gone").one()
    assert gone_row.status == "dismissed"
    assert gone_row.dismissed_at is not None
 def test_upsert_employee_increments_new_count_for_new_employee(db_session):
    """Upserting a previously unknown profile bumps the run's new_count to 1."""
    crawl_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl_run)
    db_session.commit()

    parsed_profile = {
        "source_url": "https://www.hse.ru/staff/newperson",
        "profile_type": "staff",
        "profile_id": "newperson",
        "full_name": "New Person",
        "tabs": [],
        "sections": [],
        "parser_version": "0.2.0",
        "_html": "<html></html>",
    }
    _upsert_employee(db_session, crawl_run, parsed_profile)
    db_session.commit()

    assert crawl_run.new_count == 1
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -0,0 +1,28 @@
 from bs4 import BeautifulSoup
 from app.parser.profile import extract_person_tabs
 from app.parser.profile_url import normalize_profile_url, parse_profile_identity
 def test_normalize_profile_url_supports_staff_and_org_persons():
    """Relative staff links and org/persons URLs normalise to canonical www.hse.ru URLs."""
    staff_url = normalize_profile_url("/staff/avsergeev#sci")
    assert staff_url == "https://www.hse.ru/staff/avsergeev"
    org_person_url = normalize_profile_url("https://www.hse.ru/org/persons/123/")
    assert org_person_url == "https://www.hse.ru/org/persons/123"
    identity = parse_profile_identity("https://www.hse.ru/staff/avsergeev")
    assert identity == ("staff", "avsergeev")
 def test_extract_person_tabs_prefers_person_menu_addition():
    """Tabs come from the person-menu-addition block; the unrelated person link is ignored."""
    markup = """
        <div class="person-menu is-desktop small person-menu-addition">
          <a href="#main">Домашняя страница</a>
          <a href="#sci" data-index="1">Публикации</a>
        </div>
        <a href="/org/persons/999">Other person</a>
        """
    page = BeautifulSoup(markup, "html.parser")

    tabs = extract_person_tabs(page, "https://www.hse.ru/staff/avsergeev")

    titles = [tab["title"] for tab in tabs]
    assert titles == ["Домашняя страница", "Публикации"]
    assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
Author	SHA1	Message	Date
Anton	4bd5f26469	feature: improve admin directory and crawl progress	2026-04-28 17:24:10 +03:00
admin	51d83d7062	Merge pull request 'feature: add MIEM employees parser service with admin UI and MCP' (#1 ) from feature/miem-employees-server into main Reviewed-on: #1	2026-04-28 13:21:21 +00:00
Anton	d512580960	feature: add MIEM employees parser service with admin UI and MCP	2026-04-28 16:20:51 +03:00
		`@@ -0,0 +1 @@`
							`"""HTML parsing helpers for HSE/MIEM employee pages."""`
		`@@ -0,0 +1,2 @@`
							`ALTER TABLE crawl_runs`
							`ADD COLUMN IF NOT EXISTS new_count INTEGER NOT NULL DEFAULT 0;`