feat: track crawl run employee changes and verify dismissals

This commit is contained in:
Anton
2026-05-06 15:13:15 +03:00
parent 2331c7a28d
commit d0459a2c30
16 changed files with 517 additions and 27 deletions

View File

@@ -8,7 +8,14 @@ from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
from app.services.admin_data import employee_detail_payload, format_admin_datetime, list_employees_page, run_payload, stats_payload
from app.services.admin_data import (
employee_detail_payload,
format_admin_datetime,
list_employees_page,
run_detail_payload,
run_payload,
stats_payload,
)
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
@@ -150,6 +157,20 @@ def runs(request: Request, db: Session = Depends(get_db), settings: Settings = D
return _render(request, "runs.html", {"runs": items, "errors": errors})
@router.get("/runs/{run_id}", response_class=HTMLResponse)
def run_detail(
    run_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Render the admin detail page for a single crawl run.

    ``require_admin`` is invoked before anything is loaded (presumably it
    rejects non-admin requests -- confirm in ``app.security``). When no run
    with ``run_id`` exists, the browser is sent back to the run list with a
    303 redirect instead of an error page.
    """
    require_admin(request, settings)
    crawl_run = db.get(CrawlRun, run_id)
    if crawl_run:
        context = {"run": run_detail_payload(db, crawl_run)}
        return _render(request, "run_detail.html", context)
    return RedirectResponse("/admin/runs", status_code=303)
@router.post("/runs")
def trigger_run(
request: Request,

View File

@@ -8,7 +8,7 @@ from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlRun, Employee
from app.security import require_admin
from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
from app.services.admin_data import employee_display_payload, list_employees_page, run_detail_payload, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
@@ -88,6 +88,20 @@ def latest_crawl_run(
return {"running": run_payload(running), "latest": run_payload(latest)}
@router.get("/crawl-runs/{run_id}")
def get_crawl_run(
    run_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the detailed payload (changes and errors) for one crawl run.

    ``require_admin`` is called first. A run that cannot be found -- or a
    payload that comes back empty -- is reported as ``{"error": "not_found"}``
    in the response body; this handler does not set a status code itself.
    NOTE(review): clients therefore have to inspect the body for "error";
    confirm whether an HTTP 404 would be preferable.
    """
    require_admin(request, settings)
    crawl_run = db.get(CrawlRun, run_id)
    payload = run_detail_payload(db, crawl_run) if crawl_run else None
    return payload or {"error": "not_found"}
@router.post("/crawl-runs")
def trigger_crawl(
request: Request,

View File

@@ -8,6 +8,8 @@ from app.config import Settings, get_settings
from app.db import get_db
from app.models import CrawlRun, Employee
from app.security import mcp_protected_resource_metadata, require_mcp_auth
from app.services.admin_data import run_detail_payload
from app.version import BACKEND_VERSION
router = APIRouter(prefix="/mcp")
metadata_router = APIRouter()
@@ -47,6 +49,15 @@ TOOLS = [
"description": "Return the latest crawl run status.",
"inputSchema": {"type": "object", "properties": {}},
},
{
"name": "get_crawl_run_details",
"description": "Return detailed employee changes and errors for one crawl run.",
"inputSchema": {
"type": "object",
"properties": {"run_id": {"type": "integer"}},
"required": ["run_id"],
},
},
]
@@ -66,7 +77,7 @@ async def mcp_http(
if method == "initialize":
result = {
"protocolVersion": "2024-11-05",
"serverInfo": {"name": "miem-employees", "version": "0.1.0"},
"serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
"capabilities": {"tools": {}},
}
elif method == "tools/list":
@@ -95,6 +106,9 @@ def _call_tool(db: Session, name: str, arguments: dict) -> dict:
if name == "get_crawl_status":
run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
return _tool_response(_run_payload(run) if run else {"status": "never_run"})
if name == "get_crawl_run_details":
run = db.get(CrawlRun, int(arguments["run_id"]))
return _tool_response(run_detail_payload(db, run) if run else {"error": "not_found"})
raise ValueError(f"Unknown tool: {name}")

View File

@@ -41,6 +41,7 @@ class Employee(Base):
snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee")
class EmployeeSnapshot(Base):
@@ -74,6 +75,31 @@ class CrawlRun(Base):
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
message: Mapped[str | None] = mapped_column(Text)
employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
class CrawlRunEmployeeChange(Base):
    """One per-employee change recorded while a crawl run was executing.

    Rows are written by the crawler's ``_record_employee_change`` helper.
    The change types written by the visible crawler code are ``"new"``,
    ``"missing_from_source"`` and ``"dismissed"``.
    """

    __tablename__ = "crawl_run_employee_changes"
    # Secondary indexes on the run, the employee and the change type.
    __table_args__ = (
        Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"),
        Index("ix_crawl_run_employee_changes_employee_id", "employee_id"),
        Index("ix_crawl_run_employee_changes_change_type", "change_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning crawl run; always required.
    crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
    # Link to the employee row; nullable, so a change may exist without one.
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    # profile_key / profile_url / full_name are copied from the employee when
    # the change is recorded, so they describe the employee at that moment.
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    # Free-form string (max 32 chars); not constrained at the column level.
    change_type: Mapped[str] = mapped_column(String(32), nullable=False)
    # Result of the profile availability probe; None means no probe result
    # was stored (the admin payload renders that as "not checked").
    profile_available: Mapped[bool | None] = mapped_column()
    # Optional human-readable explanation of the change.
    message: Mapped[str | None] = mapped_column(Text)
    # Filled by the ``utcnow`` default when the row is created.
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    # Bidirectional links: CrawlRun.employee_changes / Employee.crawl_run_changes.
    crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes")
    employee: Mapped[Employee | None] = relationship(back_populates="crawl_run_changes")
class CrawlError(Base):
__tablename__ = "crawl_errors"

View File

@@ -8,7 +8,7 @@ from zoneinfo import ZoneInfo
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
from app.models import CrawlRun, Employee
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
EMPLOYEE_SORTS = {
"full_name": Employee.full_name,
@@ -175,6 +175,26 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
}
def run_detail_payload(db: Session, run: CrawlRun | None) -> dict[str, Any] | None:
    """Build the detailed payload for one crawl run.

    Extends ``run_payload(run)`` with:

    * ``changes`` -- employee changes grouped by ``change_type``. The three
      known groups are always present (possibly empty); any other change
      type found in the data gets its own group after them.
    * ``changes_detail_available`` -- False when the run has no recorded
      changes at all.
    * ``errors`` -- the run's crawl errors ordered by creation time.

    Returns None when ``run`` is falsy.
    """
    if not run:
        return None

    change_query = (
        select(CrawlRunEmployeeChange)
        .where(CrawlRunEmployeeChange.crawl_run_id == run.id)
        .order_by(CrawlRunEmployeeChange.created_at, CrawlRunEmployeeChange.id)
    )
    change_rows = db.scalars(change_query).all()

    error_query = (
        select(CrawlError)
        .where(CrawlError.crawl_run_id == run.id)
        .order_by(CrawlError.created_at)
    )
    error_rows = db.scalars(error_query).all()

    # Pre-seed the known groups so consumers can index them unconditionally.
    by_type = {"new": [], "missing_from_source": [], "dismissed": []}
    for row in change_rows:
        bucket = by_type.get(row.change_type)
        if bucket is None:
            bucket = by_type[row.change_type] = []
        bucket.append(_change_payload(row))

    payload = dict(run_payload(run) or {})
    payload["changes_detail_available"] = bool(change_rows)
    payload["changes"] = by_type
    payload["errors"] = [_crawl_error_payload(row) for row in error_rows]
    return payload
def format_admin_datetime(value: Any) -> str:
if not value:
return "Не указано"
@@ -200,6 +220,52 @@ def _run_status_display(status: str | None) -> str:
return labels.get(status or "", status or "Не указано")
def _change_payload(change: CrawlRunEmployeeChange) -> dict[str, Any]:
    """Serialize one employee change row into a plain dict.

    Alongside the raw column values the payload carries display-ready
    variants: a label for the change type, a label for the availability
    probe result, and a formatted creation time. ``created_at`` is emitted
    in ISO format, or None when the row has no timestamp.
    """
    kind = change.change_type
    available = change.profile_available
    created = change.created_at
    created_iso = created.isoformat() if created else None
    return {
        "id": change.id,
        "employee_id": change.employee_id,
        "profile_key": change.profile_key,
        "profile_url": change.profile_url,
        "full_name": change.full_name,
        "change_type": kind,
        "change_type_display": _change_type_display(kind),
        "profile_available": available,
        "profile_available_display": _profile_available_display(available),
        "message": change.message,
        "created_at": created_iso,
        "created_display": format_admin_datetime(created),
    }
def _crawl_error_payload(error: CrawlError) -> dict[str, Any]:
    """Serialize one crawl error row into a plain dict.

    ``created_at`` is emitted in ISO format (None when the row has no
    timestamp); ``created_display`` is the same moment formatted by
    ``format_admin_datetime``.
    """
    occurred = error.created_at
    occurred_iso = occurred.isoformat() if occurred else None
    return {
        "id": error.id,
        "crawl_run_id": error.crawl_run_id,
        "profile_url": error.profile_url,
        "error_type": error.error_type,
        "message": error.message,
        "created_at": occurred_iso,
        "created_display": format_admin_datetime(occurred),
    }
def _change_type_display(change_type: str | None) -> str:
labels = {
"new": "Новый",
"missing_from_source": "Потеряшка",
"dismissed": "Уволен",
}
return labels.get(change_type or "", change_type or "Не указано")
def _profile_available_display(value: bool | None) -> str:
if value is True:
return "Профиль доступен"
if value is False:
return "Профиль недоступен"
return "Не проверялось"
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:

View File

@@ -9,7 +9,7 @@ from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
@@ -68,7 +68,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
finally:
time.sleep(settings.request_delay_seconds)
run.dismissed_count = _mark_dismissed(db, found_keys)
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed"
except Exception as exc:
run.status = "failed"
@@ -107,6 +107,9 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
)
db.add(employee)
run.new_count += 1
is_new = True
else:
is_new = False
employee.full_name = parsed.get("full_name")
employee.status = "active"
@@ -117,6 +120,16 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
employee.current_checksum = checksum
db.flush()
if is_new:
_record_employee_change(
db,
run,
employee,
"new",
profile_available=True,
message="Сотрудник впервые найден в источнике.",
)
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
db.add(
@@ -141,20 +154,70 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
return employee
def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
    """Reconcile active employees that were absent from this crawl's link list.

    Every employee with status ``"active"`` whose ``profile_key`` is not in
    ``found_keys`` has its canonical URL probed:

    * probe succeeds -> the employee stays active and a
      ``"missing_from_source"`` change is recorded;
    * probe fails -> the employee is set to ``"dismissed"`` with
      ``dismissed_at`` stamped, and a ``"dismissed"`` change is recorded.

    The session is committed once at the end.

    Returns:
        The number of employees dismissed by this call.
    """
    candidates = db.scalars(select(Employee).where(Employee.status == "active")).all()
    # One timestamp for the whole pass so all dismissals of a run share it.
    dismissed_at = datetime.now(timezone.utc)
    dismissed_total = 0

    for employee in candidates:
        if employee.profile_key in found_keys:
            continue

        if _profile_is_available(session, employee.canonical_url, timeout):
            # Still reachable: only note that the source list dropped the link.
            _record_employee_change(
                db,
                run,
                employee,
                "missing_from_source",
                profile_available=True,
                message="Профиль доступен, но ссылка отсутствует в исходном списке.",
            )
        else:
            employee.status = "dismissed"
            employee.dismissed_at = dismissed_at
            _record_employee_change(
                db,
                run,
                employee,
                "dismissed",
                profile_available=False,
                message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.",
            )
            dismissed_total += 1

    db.commit()
    return dismissed_total
def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool:
    """Probe ``url`` with a GET request and report whether it looks reachable.

    Redirects are followed. The profile counts as available when the final
    response status is below 400; any ``requests.RequestException`` counts
    as unavailable.

    NOTE(review): timeouts and connection errors are indistinguishable from
    a removed profile here, and the caller dismisses employees on False --
    confirm that a transient outage cannot cause mass dismissals.
    """
    try:
        probe = session.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
    except requests.RequestException:
        return False
    return probe.status_code < 400
def _record_employee_change(
    db: Session,
    run: CrawlRun,
    employee: Employee,
    change_type: str,
    *,
    profile_available: bool | None,
    message: str,
) -> None:
    """Queue a ``CrawlRunEmployeeChange`` row for ``employee`` in ``run``.

    The employee's key, canonical URL and full name are copied onto the row
    as they are at call time. The row is only added to the session; this
    function neither flushes nor commits.
    """
    change_row = CrawlRunEmployeeChange(
        crawl_run_id=run.id,
        employee_id=employee.id,
        profile_key=employee.profile_key,
        profile_url=employee.canonical_url,
        full_name=employee.full_name,
        change_type=change_type,
        profile_available=profile_available,
        message=message,
    )
    db.add(change_row)
def _checksum(data: dict) -> str:
payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()

View File

@@ -0,0 +1,64 @@
{% extends "base.html" %}
{% block title %}Запуск {{ run.id }} · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<div class="progress-panel__header">
<div>
<h2 class="panel__title">Запуск {{ run.id }}</h2>
<p class="progress-panel__empty">{{ run.started_display }} · {{ run.status_display }}</p>
</div>
<a class="admin__link" href="/admin/runs">Все запуски</a>
</div>
<div class="stats-strip">
<div class="stats-strip__item"><span class="stats-strip__label">Найдено</span><span class="stats-strip__value">{{ run.found_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Обработано</span><span class="stats-strip__value">{{ run.parsed_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Новые</span><span class="stats-strip__value">{{ run.new_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Потеряшки</span><span class="stats-strip__value">{{ run.changes.missing_from_source | length }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Уволены</span><span class="stats-strip__value">{{ run.dismissed_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Ошибки</span><span class="stats-strip__value">{{ run.error_count }}</span></div>
</div>
{% if not run.changes_detail_available %}
<p class="progress-panel__empty">Детализация сотрудников для этого запуска недоступна. Она сохраняется только для новых запусков после обновления.</p>
{% endif %}
</section>
{% for group, title in [("new", "Новые сотрудники"), ("missing_from_source", "Потеряшки"), ("dismissed", "Уволенные")] %}
<section class="panel">
<h2 class="panel__title">{{ title }}</h2>
{% set items = run.changes[group] %}
{% if items %}
<table class="table">
<thead><tr><th class="table__head">ФИО</th><th class="table__head">Профиль</th><th class="table__head">Проверка</th><th class="table__head">Комментарий</th></tr></thead>
<tbody>
{% for item in items %}
<tr>
<td class="table__cell">{% if item.employee_id %}<a class="admin__link" href="/admin/employees/{{ item.employee_id }}">{{ item.full_name or item.profile_key }}</a>{% else %}{{ item.full_name or item.profile_key }}{% endif %}</td>
<td class="table__cell"><a class="admin__link" href="{{ item.profile_url }}">{{ item.profile_url }}</a></td>
<td class="table__cell">{{ item.profile_available_display }}</td>
<td class="table__cell">{{ item.message or "" }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p class="progress-panel__empty">Нет записей.</p>
{% endif %}
</section>
{% endfor %}
<section class="panel">
<h2 class="panel__title">Ошибки запуска</h2>
{% if run.errors %}
<table class="table">
<thead><tr><th class="table__head">Профиль</th><th class="table__head">Ошибка</th><th class="table__head">Время</th></tr></thead>
<tbody>
{% for error in run.errors %}
<tr><td class="table__cell">{{ error.profile_url or "" }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td><td class="table__cell">{{ error.created_display }}</td></tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p class="progress-panel__empty">Ошибок нет.</p>
{% endif %}
</section>
{% endblock %}

View File

@@ -38,7 +38,7 @@
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Найдено</th><th class="table__head">Обработано</th><th class="table__head">Новые</th><th class="table__head">Ошибки</th><th class="table__head">Уволены</th><th class="table__head">Старт</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
<tr><td class="table__cell"><a class="admin__link" href="/admin/runs/{{ run.id }}">{{ run.id }}</a></td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.3.0"
FRONTEND_VERSION = "0.3.0"
BACKEND_VERSION = "0.3.0"
APP_VERSION = "0.4.0"
FRONTEND_VERSION = "0.4.0"
BACKEND_VERSION = "0.4.0"