Compare commits

..

7 Commits

22 changed files with 1672 additions and 77 deletions

View File

@@ -41,6 +41,13 @@ uvicorn app.main:app --reload
Админка: `http://localhost:8000/admin`. Админка: `http://localhost:8000/admin`.
В админке доступны:
- `Dashboard`: общая статистика, последний добавленный сотрудник, прогресс текущего/последнего парсинга и ручной запуск.
- `Directory`: настраиваемая таблица сотрудников с фильтрами, сортировкой, пагинацией и выбором колонок.
- `Employees`: простая legacy-таблица сотрудников.
- `Runs`: история запусков, ошибки и progress bar.
## Docker Compose ## Docker Compose
```bash ```bash
@@ -57,7 +64,7 @@ docker compose up --build
## Парсинг ## Парсинг
Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на странице `Runs` или через REST: Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на `Dashboard` и странице `Runs` или через REST:
```bash ```bash
curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..." curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..."
@@ -67,9 +74,12 @@ curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=.
- найденные сотрудники получают статус `active` и обновленный `last_seen_at`; - найденные сотрудники получают статус `active` и обновленный `last_seen_at`;
- новые сотрудники добавляются в `employees`; - новые сотрудники добавляются в `employees`;
- количество новых сотрудников за запуск сохраняется в `crawl_runs.new_count`;
- активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`; - активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`;
- каждый успешный разбор сохраняет запись в `employee_snapshots`. - каждый успешный разбор сохраняет запись в `employee_snapshots`.
Во время выполнения парсинга `found_count`, `parsed_count` и `error_count` обновляются в базе. Админка опрашивает `/api/crawl-runs/latest` и показывает прогресс как `(parsed_count + error_count) / found_count`.
## MCP ## MCP
Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`. Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`.
@@ -100,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
docker compose down docker compose down
``` ```
Версия сервиса: `0.1.0`. Админка всегда показывает версии backend и frontend в footer. Версия сервиса: `0.2.2`. Админка всегда показывает версии backend и frontend в footer.

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request, Response from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from sqlalchemy import desc, func, or_, select from sqlalchemy import desc, func, or_, select
@@ -8,7 +8,8 @@ from app.config import Settings, get_settings
from app.db import SessionLocal, get_db from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
from app.services.crawler import run_crawl from app.services.admin_data import employee_detail_payload, list_employees_page, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/admin") router = APIRouter(prefix="/admin")
@@ -18,14 +19,11 @@ templates = Jinja2Templates(directory="app/templates")
@router.get("", response_class=HTMLResponse) @router.get("", response_class=HTMLResponse)
def dashboard(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)): def dashboard(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings) require_admin(request, settings)
counts = { counts = stats_payload(db)
"active": db.scalar(select(func.count()).select_from(Employee).where(Employee.status == "active")) or 0, counts["runs"] = db.scalar(select(func.count()).select_from(CrawlRun)) or 0
"dismissed": db.scalar(select(func.count()).select_from(Employee).where(Employee.status == "dismissed")) or 0, counts["errors"] = db.scalar(select(func.count()).select_from(CrawlError)) or 0
"runs": db.scalar(select(func.count()).select_from(CrawlRun)) or 0,
"errors": db.scalar(select(func.count()).select_from(CrawlError)) or 0,
}
runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all() runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
return _render(request, "dashboard.html", {"counts": counts, "runs": runs}) return _render(request, "dashboard.html", {"counts": counts, "runs": runs, "latest_run": run_payload(runs[0]) if runs else None})
@router.get("/login", response_class=HTMLResponse) @router.get("/login", response_class=HTMLResponse)
@@ -35,7 +33,6 @@ def login_form(request: Request):
@router.post("/login") @router.post("/login")
def login( def login(
response: Response,
request: Request, request: Request,
username: str = Form(...), username: str = Form(...),
password: str = Form(...), password: str = Form(...),
@@ -74,6 +71,57 @@ def employees(
return _render(request, "employees.html", {"employees": items, "status": status or "", "q": q or ""}) return _render(request, "employees.html", {"employees": items, "status": status or "", "q": q or ""})
@router.get("/directory", response_class=HTMLResponse)
def directory(
request: Request,
status: str | None = None,
q: str | None = None,
started_from: str | None = None,
started_to: str | None = None,
has_email: str | None = None,
sort: str = "full_name",
direction: str = "asc",
limit: int = 50,
offset: int = 0,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
parsed_started_from = _parse_date(started_from)
parsed_started_to = _parse_date(started_to)
parsed_has_email = None if has_email in (None, "") else has_email == "true"
page = list_employees_page(
db,
status=status,
q=q,
started_from=parsed_started_from,
started_to=parsed_started_to,
has_email=parsed_has_email,
sort=sort,
direction=direction,
limit=limit,
offset=offset,
)
return _render(
request,
"directory.html",
{
"page": page,
"filters": {
"status": status or "",
"q": q or "",
"started_from": started_from or "",
"started_to": started_to or "",
"has_email": has_email or "",
"sort": sort,
"direction": direction,
"limit": limit,
"offset": offset,
},
},
)
@router.get("/employees/{employee_id}", response_class=HTMLResponse) @router.get("/employees/{employee_id}", response_class=HTMLResponse)
def employee_detail( def employee_detail(
employee_id: int, employee_id: int,
@@ -86,7 +134,11 @@ def employee_detail(
if not employee: if not employee:
return RedirectResponse("/admin/employees", status_code=303) return RedirectResponse("/admin/employees", status_code=303)
snapshots = sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20] snapshots = sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20]
return _render(request, "employee_detail.html", {"employee": employee, "snapshots": snapshots}) return _render(
request,
"employee_detail.html",
{"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots},
)
@router.get("/runs", response_class=HTMLResponse) @router.get("/runs", response_class=HTMLResponse)
@@ -101,18 +153,40 @@ def runs(request: Request, db: Session = Depends(get_db), settings: Settings = D
def trigger_run( def trigger_run(
request: Request, request: Request,
background_tasks: BackgroundTasks, background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings), settings: Settings = Depends(get_settings),
): ):
require_admin(request, settings) require_admin(request, settings)
if get_running_run(db):
return RedirectResponse("/admin/runs", status_code=303)
def _crawl() -> None: def _crawl() -> None:
with SessionLocal() as db: with SessionLocal() as db:
run_crawl(db, settings) run_crawl_if_idle(db, settings)
background_tasks.add_task(_crawl) background_tasks.add_task(_crawl)
return RedirectResponse("/admin/runs", status_code=303) return RedirectResponse("/admin/runs", status_code=303)
@router.post("/crawl-now")
def crawl_now(
request: Request,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
if get_running_run(db):
return RedirectResponse("/admin", status_code=303)
def _crawl() -> None:
with SessionLocal() as db:
run_crawl_if_idle(db, settings)
background_tasks.add_task(_crawl)
return RedirectResponse("/admin", status_code=303)
def _render(request: Request, template: str, context: dict, status_code: int = 200) -> HTMLResponse: def _render(request: Request, template: str, context: dict, status_code: int = 200) -> HTMLResponse:
payload = { payload = {
"request": request, "request": request,
@@ -120,4 +194,15 @@ def _render(request: Request, template: str, context: dict, status_code: int = 2
"frontend_version": FRONTEND_VERSION, "frontend_version": FRONTEND_VERSION,
**context, **context,
} }
return templates.TemplateResponse(template, payload, status_code=status_code) return templates.TemplateResponse(request, template, payload, status_code=status_code)
def _parse_date(value: str | None):
if not value:
return None
try:
from datetime import date
return date.fromisoformat(value)
except ValueError:
return None

View File

@@ -1,12 +1,15 @@
from datetime import date
from fastapi import APIRouter, BackgroundTasks, Depends, Request from fastapi import APIRouter, BackgroundTasks, Depends, Request
from sqlalchemy import desc, or_, select from sqlalchemy import desc, select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.config import Settings, get_settings from app.config import Settings, get_settings
from app.db import SessionLocal, get_db from app.db import SessionLocal, get_db
from app.models import CrawlRun, Employee from app.models import CrawlRun, Employee
from app.security import require_admin from app.security import require_admin
from app.services.crawler import run_crawl from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/api") router = APIRouter(prefix="/api")
@@ -22,20 +25,29 @@ def list_employees(
request: Request, request: Request,
status: str | None = None, status: str | None = None,
q: str | None = None, q: str | None = None,
started_from: date | None = None,
started_to: date | None = None,
has_email: bool | None = None,
sort: str = "full_name",
direction: str = "asc",
limit: int = 50, limit: int = 50,
offset: int = 0, offset: int = 0,
db: Session = Depends(get_db), db: Session = Depends(get_db),
settings: Settings = Depends(get_settings), settings: Settings = Depends(get_settings),
) -> dict: ) -> dict:
require_admin(request, settings) require_admin(request, settings)
stmt = select(Employee) return list_employees_page(
if status: db,
stmt = stmt.where(Employee.status == status) status=status,
if q: q=q,
pattern = f"%{q}%" started_from=started_from,
stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern))) started_to=started_to,
employees = db.scalars(stmt.order_by(Employee.full_name).limit(limit).offset(offset)).all() has_email=has_email,
return {"items": [_employee_summary(item) for item in employees], "limit": limit, "offset": offset} sort=sort,
direction=direction,
limit=limit,
offset=offset,
)
@router.get("/employees/{employee_id}") @router.get("/employees/{employee_id}")
@@ -61,34 +73,53 @@ def list_crawl_runs(
) -> dict: ) -> dict:
require_admin(request, settings) require_admin(request, settings)
runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(limit)).all() runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(limit)).all()
return {"items": [_run_summary(run) for run in runs]} return {"items": [run_payload(run) for run in runs]}
@router.get("/crawl-runs/latest")
def latest_crawl_run(
request: Request,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
) -> dict:
require_admin(request, settings)
running = get_running_run(db)
latest = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
return {"running": run_payload(running), "latest": run_payload(latest)}
@router.post("/crawl-runs") @router.post("/crawl-runs")
def trigger_crawl( def trigger_crawl(
request: Request, request: Request,
background_tasks: BackgroundTasks, background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings), settings: Settings = Depends(get_settings),
) -> dict: ) -> dict:
require_admin(request, settings) require_admin(request, settings)
running = get_running_run(db)
if running:
return {"status": "already_running", "run": run_payload(running)}
def _crawl() -> None: def _crawl() -> None:
with SessionLocal() as db: with SessionLocal() as db:
run_crawl(db, settings) run_crawl_if_idle(db, settings)
background_tasks.add_task(_crawl) background_tasks.add_task(_crawl)
return {"status": "scheduled"} return {"status": "scheduled"}
@router.get("/stats")
def stats(
request: Request,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
) -> dict:
require_admin(request, settings)
return stats_payload(db)
def _employee_summary(employee: Employee) -> dict: def _employee_summary(employee: Employee) -> dict:
return { return employee_display_payload(employee)
"id": employee.id,
"full_name": employee.full_name,
"status": employee.status,
"canonical_url": employee.canonical_url,
"last_seen_at": employee.last_seen_at.isoformat() if employee.last_seen_at else None,
"dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None,
}
def _employee_detail(employee: Employee) -> dict: def _employee_detail(employee: Employee) -> dict:
@@ -99,15 +130,4 @@ def _employee_detail(employee: Employee) -> dict:
def _run_summary(run: CrawlRun) -> dict: def _run_summary(run: CrawlRun) -> dict:
return { return run_payload(run) or {}
"id": run.id,
"source_url": run.source_url,
"status": run.status,
"started_at": run.started_at.isoformat() if run.started_at else None,
"finished_at": run.finished_at.isoformat() if run.finished_at else None,
"found_count": run.found_count,
"parsed_count": run.parsed_count,
"error_count": run.error_count,
"dismissed_count": run.dismissed_count,
"message": run.message,
}

View File

@@ -69,6 +69,7 @@ class CrawlRun(Base):
finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False) dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
message: Mapped[str | None] = mapped_column(Text) message: Mapped[str | None] = mapped_column(Text)

319
app/services/admin_data.py Normal file
View File

@@ -0,0 +1,319 @@
from __future__ import annotations
from datetime import date, datetime, time
from math import ceil
from typing import Any
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
from app.models import CrawlRun, Employee
# Sort keys accepted by list_employees_page, mapped to the expression used in
# ORDER BY. A key that is not listed here falls back to Employee.full_name in
# list_employees_page.
EMPLOYEE_SORTS = {
"full_name": Employee.full_name,
"status": Employee.status,
"first_seen_at": Employee.first_seen_at,
"last_seen_at": Employee.last_seen_at,
"dismissed_at": Employee.dismissed_at,
# Sorts on the "hse_start_year" entry of the current_data JSON, read as an integer.
"hse_start_year": Employee.current_data["hse_start_year"].as_integer(),
}
def employee_display_payload(employee: Employee) -> dict[str, Any]:
    """Build the flat summary dict used for employee list rows.

    Values are read from the employee's columns and from its ``current_data``
    JSON (``positions``, ``hse_start_year``, ``contacts``, ``sections``).
    Missing or empty JSON parts are treated as empty containers.
    """
    profile = employee.current_data or {}
    contact_info = profile.get("contacts") or {}
    section_list = profile.get("sections") or []
    position_list = profile.get("positions") or []
    email_list = contact_info.get("emails") or []
    phone_list = contact_info.get("phones") or []

    def _iso(moment: Any) -> str | None:
        # Timestamps are serialized as ISO-8601 strings; unset ones stay None.
        return moment.isoformat() if moment else None

    summary: dict[str, Any] = {}
    summary["id"] = employee.id
    summary["full_name"] = employee.full_name
    summary["status"] = employee.status
    summary["canonical_url"] = employee.canonical_url
    summary["positions"] = position_list
    summary["positions_text"] = "; ".join(position_list)
    summary["hse_start_year"] = profile.get("hse_start_year")
    summary["emails"] = email_list
    summary["email_text"] = ", ".join(email_list)
    summary["phones"] = phone_list
    summary["phone_text"] = ", ".join(phone_list)
    summary["address"] = contact_info.get("address")
    summary["publications_count"] = _count_section_items(section_list, "publications")
    summary["courses_count"] = _count_section_items(section_list, "courses_by_year")
    summary["first_seen_at"] = _iso(employee.first_seen_at)
    summary["last_seen_at"] = _iso(employee.last_seen_at)
    summary["dismissed_at"] = _iso(employee.dismissed_at)
    return summary
def employee_detail_payload(employee: Employee) -> dict[str, Any]:
    """Build the full detail dict for a single employee.

    Starts from ``employee_display_payload`` and adds profile identifiers,
    normalized contacts, external ids and normalized sections. Column values
    on the employee take precedence over the same keys in ``current_data``.
    """
    profile = employee.current_data or {}
    contact_info = profile.get("contacts") or {}

    detail: dict[str, Any] = dict(employee_display_payload(employee))
    detail["profile_type"] = employee.profile_type or profile.get("profile_type")
    detail["profile_id"] = employee.profile_id or profile.get("profile_id")
    detail["parser_version"] = employee.parser_version or profile.get("parser_version")
    detail["contacts"] = {
        "emails": _clean_list(contact_info.get("emails")),
        "phones": _clean_list(contact_info.get("phones")),
        "address": contact_info.get("address"),
        "items": _normalize_contact_items(contact_info.get("items")),
    }
    detail["external_ids"] = _normalize_external_ids(profile.get("external_ids"))
    detail["sections"] = [_normalize_section(entry) for entry in profile.get("sections") or []]
    return detail
def _escape_like(value: str) -> str:
    """Escape LIKE wildcards in *value* so it matches literally (escape char: backslash)."""
    return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def build_employee_query(
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
) -> Select[tuple[Employee]]:
    """Build the filtered ``select(Employee)`` statement shared by list views.

    Args:
        status: Keep only employees with exactly this status.
        q: Case-insensitive substring matched against ``full_name`` and
            ``canonical_url``. ``%`` and ``_`` in the text are matched
            literally rather than acting as LIKE wildcards.
        started_from: Lower bound (inclusive, start of day) on ``first_seen_at``.
        started_to: Upper bound (inclusive, end of day) on ``first_seen_at``.
        has_email: ``True`` keeps rows whose serialized ``current_data``
            contains ``@``; ``False`` keeps rows where it is NULL or has none.

    Returns:
        The statement without ordering, limit or offset applied.
    """
    stmt = select(Employee)
    filters = []
    if status:
        filters.append(Employee.status == status)
    if q:
        # User input must not be able to inject LIKE wildcards into the pattern.
        pattern = f"%{_escape_like(q)}%"
        filters.append(
            or_(
                Employee.full_name.ilike(pattern, escape="\\"),
                Employee.canonical_url.ilike(pattern, escape="\\"),
            )
        )
    # NOTE(review): the date bounds below are naive datetimes; confirm how they
    # compare against first_seen_at if that column is timezone-aware.
    if started_from:
        filters.append(Employee.first_seen_at >= datetime.combine(started_from, time.min))
    if started_to:
        filters.append(Employee.first_seen_at <= datetime.combine(started_to, time.max))
    # NOTE(review): the e-mail check looks for "@" anywhere in the serialized
    # JSON, not only inside contacts.emails.
    if has_email is True:
        filters.append(Employee.current_data.cast(Text).ilike("%@%"))
    elif has_email is False:
        filters.append(or_(Employee.current_data.is_(None), ~Employee.current_data.cast(Text).ilike("%@%")))
    if filters:
        stmt = stmt.where(and_(*filters))
    return stmt
def list_employees_page(
    db: Session,
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
) -> dict[str, Any]:
    """Return one page of employees plus pagination metadata.

    Args:
        db: Session used for the count and the page query.
        status, q, started_from, started_to, has_email: Filters forwarded to
            ``build_employee_query``.
        sort: Key of ``EMPLOYEE_SORTS``; unknown keys sort by full name.
        direction: ``"desc"`` for descending; anything else is ascending.
        limit: Page size, clamped to the range 1..200.
        offset: Rows to skip; negative values are treated as 0.

    Returns:
        Dict with ``items`` (display payloads), ``total``, the effective
        ``limit`` and ``offset``, ``pages`` (0 when nothing matches) and the
        1-based ``page`` number.
    """
    limit = max(1, min(limit, 200))
    offset = max(0, offset)
    base_stmt = build_employee_query(
        status=status,
        q=q,
        started_from=started_from,
        started_to=started_to,
        has_email=has_email,
    )
    total = db.scalar(select(func.count()).select_from(base_stmt.subquery())) or 0
    sort_column = EMPLOYEE_SORTS.get(sort, Employee.full_name)
    order = desc(sort_column) if direction == "desc" else sort_column
    # Employee.id is a unique tiebreaker: without it, rows with equal sort
    # values have no defined order and can repeat or vanish across pages.
    page_stmt = base_stmt.order_by(order, Employee.id).limit(limit).offset(offset)
    employees = db.scalars(page_stmt).all()
    return {
        "items": [employee_display_payload(employee) for employee in employees],
        "total": total,
        "limit": limit,
        "offset": offset,
        "pages": ceil(total / limit) if total else 0,
        "page": (offset // limit) + 1,
    }
def stats_payload(db: Session) -> dict[str, Any]:
    """Collect dashboard statistics: employee counts and crawl-run summaries.

    Returns a dict with total/active/dismissed employee counts, the number of
    new employees recorded on the latest run, the most recently first-seen
    employee, the latest run and the currently running run (each None when
    absent).
    """
    newest_first = desc(CrawlRun.started_at)
    latest_run = db.scalar(select(CrawlRun).order_by(newest_first).limit(1))
    running_run = db.scalar(
        select(CrawlRun).where(CrawlRun.status == "running").order_by(newest_first).limit(1)
    )
    latest_added = db.scalar(select(Employee).order_by(desc(Employee.first_seen_at)).limit(1))

    employee_count = select(func.count()).select_from(Employee)
    stats: dict[str, Any] = {}
    stats["total"] = db.scalar(employee_count) or 0
    stats["active"] = db.scalar(employee_count.where(Employee.status == "active")) or 0
    stats["dismissed"] = db.scalar(employee_count.where(Employee.status == "dismissed")) or 0
    stats["new_in_last_run"] = latest_run.new_count if latest_run else 0
    stats["latest_added"] = employee_display_payload(latest_added) if latest_added else None
    # run_payload already maps a missing run to None.
    stats["latest_run"] = run_payload(latest_run)
    stats["current_running_run"] = run_payload(running_run)
    return stats
def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
    """Serialize a crawl run, including derived progress figures.

    ``processed_count`` is ``parsed_count + error_count`` and
    ``progress_percent`` is that share of ``found_count`` rounded to one
    decimal (0 while nothing has been found). Returns None for a missing run.
    """
    if not run:
        return None

    def _iso(moment: Any) -> str | None:
        return moment.isoformat() if moment else None

    done = run.parsed_count + run.error_count
    if run.found_count:
        progress = round((done / run.found_count) * 100, 1)
    else:
        progress = 0

    payload: dict[str, Any] = {
        "id": run.id,
        "source_url": run.source_url,
        "status": run.status,
        "started_at": _iso(run.started_at),
        "finished_at": _iso(run.finished_at),
        "found_count": run.found_count,
        "parsed_count": run.parsed_count,
        "new_count": run.new_count,
        "error_count": run.error_count,
        "dismissed_count": run.dismissed_count,
        "processed_count": done,
        "progress_percent": progress,
        "message": run.message,
    }
    return payload
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:
if section.get("type") != section_type:
continue
if section_type == "publications":
total += len(section.get("publications") or section.get("items") or [])
elif section_type == "courses_by_year":
total += len(section.get("courses") or [])
return total
def _clean_list(values: Any) -> list[str]:
if not isinstance(values, list):
return []
return [str(value).strip() for value in values if str(value or "").strip()]
def _normalize_contact_items(items: Any) -> list[str]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if isinstance(item, dict):
value = item.get("raw") or item.get("value") or item.get("text")
else:
value = item
value = str(value or "").strip()
if value:
normalized.append(value)
return normalized
def _normalize_external_ids(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
system = str(item.get("system") or "").strip()
value = str(item.get("value") or "").strip()
url = str(item.get("url") or "").strip()
if system or value or url:
normalized.append({"system": system or "ID", "value": value or url, "url": url or None})
return normalized
def _normalize_section(section: Any) -> dict[str, Any]:
if not isinstance(section, dict):
return {"title": "Раздел", "type": "generic", "paragraphs": [str(section)], "items": [], "links": []}
section_type = section.get("type") or "generic"
paragraphs = _clean_list(section.get("paragraphs"))
items = _clean_list(section.get("items"))
raw_text = str(section.get("raw_text") or "").strip()
if not paragraphs and not items and raw_text:
paragraphs = [raw_text]
return {
"title": section.get("title") or "Раздел",
"type": section_type,
"raw_text": raw_text,
"paragraphs": paragraphs,
"items": items,
"links": _normalize_links(section.get("links")),
"year_entries": _normalize_year_entries(section.get("year_entries")),
"publications": _normalize_publications(section.get("publications")),
"publications_count": section.get("publications_count"),
"academic_year": section.get("academic_year"),
"courses": _normalize_courses(section.get("courses")),
"table": _normalize_table(section.get("table")),
}
def _normalize_links(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
text = str(item.get("text") or item.get("url") or "").strip()
url = str(item.get("url") or "").strip()
if text and url:
normalized.append({"text": text, "url": url})
return normalized
def _normalize_year_entries(items: Any) -> list[dict[str, Any]]:
    """Normalize per-year entries to ``{"year", "text", "links"}`` dicts.

    Non-dict entries and entries with blank text are skipped; ``year`` is
    passed through unchanged and ``links`` go through ``_normalize_links``.
    """
    if not isinstance(items, list):
        return []
    entries: list[dict[str, Any]] = []
    for entry in items:
        if not isinstance(entry, dict):
            continue
        body = str(entry.get("text") or "").strip()
        if not body:
            continue
        entries.append(
            {
                "year": entry.get("year"),
                "text": body,
                "links": _normalize_links(entry.get("links")),
            }
        )
    return entries
def _normalize_publications(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
text = str(item or "").strip()
if text:
normalized.append({"title": text, "text": text, "url": None})
continue
title = str(item.get("title") or "").strip()
text = str(item.get("text") or title).strip()
url = str(item.get("url") or "").strip()
if title or text:
normalized.append({"title": title or text, "text": text or title, "url": url or None})
return normalized
def _normalize_courses(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
title = str(item or "").strip()
if title:
normalized.append({"title": title, "url": None})
continue
title = str(item.get("title") or "").strip()
url = str(item.get("url") or "").strip()
if title or url:
normalized.append({"title": title or url, "url": url or None})
return normalized
def _normalize_table(table: Any) -> dict[str, Any] | None:
    """Normalize a table to ``{"headers", "rows"}`` or None.

    Headers and each row's cells are cleaned with ``_clean_list``; non-dict
    rows and rows without any cells are dropped. Returns None when the
    input is not a dict or nothing is left after cleaning.
    """
    if not isinstance(table, dict):
        return None
    header_cells = _clean_list(table.get("headers"))
    body_rows = [
        {"cells": cells, "link_url": row.get("link_url")}
        for row in table.get("rows") or []
        if isinstance(row, dict) and (cells := _clean_list(row.get("cells")))
    ]
    if header_cells or body_rows:
        return {"headers": header_cells, "rows": body_rows}
    return None

View File

@@ -0,0 +1,17 @@
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlRun
from app.services.crawler import run_crawl
def get_running_run(db: Session) -> CrawlRun | None:
    """Return the most recently started crawl run with status "running", or None."""
    stmt = (
        select(CrawlRun)
        .where(CrawlRun.status == "running")
        .order_by(desc(CrawlRun.started_at))
        .limit(1)
    )
    return db.scalar(stmt)
def run_crawl_if_idle(db: Session, settings: Settings) -> tuple[CrawlRun, bool]:
    """Start a crawl unless one is already marked as running.

    Returns:
        ``(run, started)``: the run returned by ``run_crawl`` and True when
        a crawl was started here, otherwise the run already in progress
        and False.
    """
    # NOTE(review): the check and the start are separate steps; whether
    # run_crawl itself guards against concurrent starts is not visible here.
    active = get_running_run(db)
    if not active:
        return run_crawl(db, settings), True
    return active, False

View File

@@ -106,6 +106,7 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
first_seen_at=now, first_seen_at=now,
) )
db.add(employee) db.add(employee)
run.new_count += 1
employee.full_name = parsed.get("full_name") employee.full_name = parsed.get("full_name")
employee.status = "active" employee.status = "active"

View File

@@ -151,3 +151,422 @@
border-radius: 8px; border-radius: 8px;
white-space: pre-wrap; white-space: pre-wrap;
} }
/* Employee card: stacked grid of a header panel and bordered white sections. */
.employee-card {
display: grid;
gap: 18px;
}
/* Header panel: identity block and side content pushed to opposite edges. */
.employee-card__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 18px;
padding: 22px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.employee-card__identity {
display: grid;
gap: 10px;
}
.employee-card__title {
margin: 0;
font-size: 24px;
}
.employee-card__section {
padding: 20px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* Metadata grid: as many columns of at least 240px as fit the width. */
.employee-card__meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 14px;
margin: 0;
}
/* min-width: 0 lets long values shrink inside the grid track instead of overflowing it. */
.employee-card__meta-item {
min-width: 0;
}
/* Wide variant spans every column of the metadata grid. */
.employee-card__meta-item--wide {
grid-column: 1 / -1;
}
.employee-card__meta-label {
margin-bottom: 5px;
color: #6b7280;
font-size: 12px;
font-weight: 700;
text-transform: uppercase;
}
.employee-card__meta-value {
margin: 0;
color: #1f2937;
line-height: 1.45;
}
.employee-card__list {
display: grid;
gap: 8px;
margin: 0;
padding-left: 18px;
}
.employee-card__list-item {
line-height: 1.45;
}
.employee-card__sections {
display: grid;
gap: 14px;
}
/* Employee section: one grey bordered block with header, text, table and links. */
.employee-section {
padding: 16px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 8px;
}
.employee-section__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 12px;
margin-bottom: 12px;
}
.employee-section__title {
margin: 0;
font-size: 17px;
}
/* Pill-shaped type badge; flex: 0 0 auto keeps it from growing or shrinking next to the title. */
.employee-section__type {
flex: 0 0 auto;
padding: 3px 8px;
color: #475569;
background: #e2e8f0;
border-radius: 999px;
font-size: 12px;
}
.employee-section__note {
margin: 0 0 10px;
color: #4b5563;
font-weight: 700;
}
.employee-section__text {
margin: 0 0 10px;
line-height: 1.55;
}
/* Wrapper scrolls horizontally so a wide table does not stretch the section. */
.employee-section__table-wrap {
overflow-x: auto;
}
.employee-section__table {
width: 100%;
border-collapse: collapse;
background: #ffffff;
}
/* Shared cell geometry for header and body cells. */
.employee-section__head,
.employee-section__cell {
padding: 10px;
border-bottom: 1px solid #e5e7eb;
text-align: left;
vertical-align: top;
}
.employee-section__head {
color: #374151;
background: #f3f4f6;
font-size: 13px;
}
.employee-section__links {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 12px;
}
/* Links are drawn as teal pills without underline. */
.employee-section__link {
padding: 5px 9px;
color: #0f766e;
background: #ccfbf1;
border-radius: 999px;
font-size: 12px;
font-weight: 700;
text-decoration: none;
}
/* Stats strip: responsive row of label/value tiles, each at least 220px wide. */
.stats-strip {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
gap: 14px;
margin-top: 16px;
}
.stats-strip__item {
padding: 14px 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* Label and value are forced onto separate lines with display: block. */
.stats-strip__label {
display: block;
color: #6b7280;
font-size: 12px;
text-transform: uppercase;
}
.stats-strip__value {
display: block;
margin-top: 6px;
color: #1f2937;
font-weight: 700;
}
/* Progress panel: header row, meta line and a progress bar. */
.progress-panel {
display: grid;
gap: 12px;
}
.progress-panel__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
}
.progress-panel__body {
display: grid;
gap: 10px;
}
.progress-panel__meta {
display: flex;
flex-wrap: wrap;
gap: 12px;
color: #4b5563;
font-size: 14px;
}
.progress-panel__percent {
color: #0f766e;
font-weight: 700;
}
.progress-panel__empty {
margin: 0;
color: #6b7280;
}
/* Track of the bar; overflow: hidden clips the fill to the rounded ends. */
.progress-bar {
height: 12px;
overflow: hidden;
background: #e5e7eb;
border-radius: 999px;
}
/* Fill starts at zero width and animates width changes; the actual width is
   presumably set elsewhere (inline style or script) — not visible here. */
.progress-bar__fill {
height: 100%;
width: 0;
background: #0f766e;
transition: width 0.25s ease;
}
/* Directory page: header, filter form, scrollable table and pagination. */
.directory {
display: grid;
gap: 18px;
}
.directory__header {
display: flex;
align-items: end;
justify-content: space-between;
gap: 16px;
}
.directory__title {
margin: 0;
font-size: 24px;
}
.directory__summary {
margin: 6px 0 0;
color: #6b7280;
}
/* Filter grid: one wider first column followed by six equal columns. */
.directory__filters {
display: grid;
grid-template-columns: minmax(220px, 1.7fr) repeat(6, minmax(120px, 1fr));
gap: 10px;
padding: 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* min-width: 0 lets inputs shrink to their grid track instead of overflowing it. */
.directory__input {
min-width: 0;
padding: 10px 12px;
border: 1px solid #cbd5e1;
border-radius: 6px;
}
/* Wrapper scrolls horizontally when the table is wider than the page. */
.directory__table-wrap {
overflow-x: auto;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.directory__pagination {
display: flex;
align-items: center;
justify-content: center;
gap: 16px;
}
.directory__page {
color: #4b5563;
font-weight: 700;
}
/* Directory table: the 1120px minimum width forces horizontal scrolling in a narrower container. */
.directory-table {
width: 100%;
min-width: 1120px;
border-collapse: collapse;
}
.directory-table__head {
padding: 12px 10px;
color: #374151;
background: #f9fafb;
border-bottom: 1px solid #e5e7eb;
font-size: 13px;
text-align: left;
white-space: nowrap;
}
.directory-table__cell {
max-width: 280px;
padding: 12px 10px;
border-bottom: 1px solid #e5e7eb;
vertical-align: top;
}
/* Pointer cursor and hover tint; presumably rows are clickable — the handler is not visible here. */
.directory-table__row {
cursor: pointer;
}
.directory-table__row:hover {
background: #f0fdfa;
}
.directory-table__empty {
padding: 28px;
color: #6b7280;
text-align: center;
}
/* Modifier classes that hide a column's cells and header; applyColumns in admin.js toggles them. */
.directory-table__cell--hidden,
.directory-table__head--hidden {
display: none;
}
/* Column picker modal: full-viewport fixed layer with the panel centred in it. */
.columns-modal {
position: fixed;
inset: 0;
z-index: 50;
display: grid;
place-items: center;
padding: 20px;
}
/* The display: grid above would otherwise override the hidden attribute, so hiding is restored explicitly. */
.columns-modal[hidden] {
display: none;
}
/* Semi-transparent backdrop covering the whole modal layer. */
.columns-modal__backdrop {
position: absolute;
inset: 0;
background: rgba(17, 24, 39, 0.54);
}
/* position: relative places the panel above the absolutely positioned backdrop;
   the panel scrolls internally once it reaches its maximum height. */
.columns-modal__panel {
position: relative;
width: min(620px, 100%);
max-height: min(720px, calc(100vh - 40px));
overflow: auto;
padding: 20px;
background: #ffffff;
border-radius: 8px;
box-shadow: 0 24px 80px rgba(15, 23, 42, 0.22);
}
.columns-modal__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}
.columns-modal__title {
margin: 0;
font-size: 18px;
}
.columns-modal__grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 10px;
margin-top: 18px;
}
.columns-modal__option {
display: flex;
align-items: center;
gap: 8px;
padding: 10px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 6px;
}
.columns-modal__checkbox {
width: 16px;
height: 16px;
}
/* Up to 920px: two filter columns; header rows stack vertically. */
@media (max-width: 920px) {
  .directory__filters {
    grid-template-columns: 1fr 1fr;
  }

  .progress-panel__header,
  .directory__header,
  .employee-card__header {
    flex-direction: column;
    align-items: stretch;
  }
}

/* Up to 620px: filters collapse to a single column. */
@media (max-width: 620px) {
  .directory__filters {
    grid-template-columns: 1fr;
  }
}

111
app/static/admin.js Normal file
View File

@@ -0,0 +1,111 @@
(function () {
// Column keys used when no saved column list is available.
const columnDefaults = [
  "full_name", "status", "positions", "hse_start_year",
  "email", "last_seen_at", "dismissed_at", "profile",
];
// localStorage key under which the chosen column list is stored.
const storageKey = "miem.directory.columns";
// Return the saved list of visible column keys, falling back to the defaults
// when storage is unreadable, the value is not valid JSON, or the stored
// value is not a non-empty array.
function readColumns() {
  let saved;
  try {
    saved = JSON.parse(localStorage.getItem(storageKey) || "[]");
  } catch (_error) {
    return columnDefaults;
  }
  if (!Array.isArray(saved) || saved.length === 0) {
    return columnDefaults;
  }
  return saved;
}
// Persist the list of visible column keys to localStorage.
// Persistence is best-effort: setItem can throw (quota exceeded, storage
// disabled or blocked), and a failure here must not prevent the caller from
// applying the new column set to the table.
function writeColumns(columns) {
  try {
    localStorage.setItem(storageKey, JSON.stringify(columns));
  } catch (_error) {
    // Ignore: the selection still applies for the current page view.
  }
}
// Show or hide every [data-column] header/body cell according to `columns`
// and sync the [data-column-toggle] checkboxes with the same list.
function applyColumns(columns) {
  for (const node of document.querySelectorAll("[data-column]")) {
    const hidden = !columns.includes(node.dataset.column);
    const classes = node.classList;
    // Each modifier is only applied to the element type it belongs to.
    classes.toggle("directory-table__cell--hidden", hidden && classes.contains("directory-table__cell"));
    classes.toggle("directory-table__head--hidden", hidden && classes.contains("directory-table__head"));
  }
  for (const checkbox of document.querySelectorAll("[data-column-toggle]")) {
    checkbox.checked = columns.includes(checkbox.value);
  }
}
// Wire up the directory page: column visibility, the column-picker modal and
// click-to-open rows. Does nothing on pages without a [data-directory-table].
function setupColumns() {
  if (!document.querySelector("[data-directory-table]")) return;
  let columns = readColumns();
  const modal = document.querySelector("[data-columns-modal]");
  // Guard against missing modal markup so the handlers below cannot throw.
  const setModalHidden = (hidden) => {
    if (modal) modal.hidden = hidden;
  };
  applyColumns(columns);
  document.querySelectorAll("[data-columns-open]").forEach((button) => {
    button.addEventListener("click", () => setModalHidden(false));
  });
  document.querySelectorAll("[data-columns-close]").forEach((button) => {
    button.addEventListener("click", () => setModalHidden(true));
  });
  // Let keyboard users dismiss the modal with Escape.
  document.addEventListener("keydown", (event) => {
    if (event.key === "Escape") setModalHidden(true);
  });
  document.querySelectorAll("[data-column-toggle]").forEach((checkbox) => {
    checkbox.addEventListener("change", () => {
      columns = Array.from(document.querySelectorAll("[data-column-toggle]:checked")).map((item) => item.value);
      // Never allow an empty table: keep at least the name column.
      if (!columns.length) columns = ["full_name"];
      writeColumns(columns);
      applyColumns(columns);
    });
  });
  document.querySelectorAll("[data-row-href]").forEach((row) => {
    row.addEventListener("click", (event) => {
      // Interactive children keep their own behaviour.
      if (event.target.closest("a, button, input, select, label")) return;
      // A click that ends a text selection (e.g. copying an email) must not
      // navigate away from the page.
      const selection = window.getSelection ? window.getSelection() : null;
      if (selection && !selection.isCollapsed) return;
      window.location.href = row.dataset.rowHref;
    });
  });
}
// Keep the progress panel in sync with /api/crawl-runs/latest.
// Polls every 4 seconds while a run is reported as running and stops once no
// run is running. Does nothing on pages without a [data-progress-panel].
function setupProgress() {
  const panel = document.querySelector("[data-progress-panel]");
  if (!panel) return;
  const pollIntervalMs = 4000;
  // A transient error should not stop the progress display for good, but a
  // permanently failing endpoint should not be polled forever either.
  const maxConsecutiveFailures = 5;

  const setText = (selector, value) => {
    const node = document.querySelector(selector);
    // Skip missing fields instead of rendering "undefined".
    if (node && value !== undefined && value !== null) node.textContent = value;
  };
  const update = (run) => {
    if (!run) return;
    setText("[data-progress-status]", run.status);
    setText("[data-progress-processed]", run.processed_count);
    setText("[data-progress-found]", run.found_count);
    setText("[data-progress-errors]", run.error_count);
    // Clamp so a malformed value can never produce an invalid bar width.
    const percent = Math.min(100, Math.max(0, Number(run.progress_percent) || 0));
    const fill = document.querySelector("[data-progress-fill]");
    if (fill) fill.style.width = `${percent}%`;
    setText("[data-progress-percent]", percent);
  };
  // Resolves to "running" (keep polling), "idle" (stop) or "failed" (retry).
  const poll = async () => {
    try {
      const response = await fetch("/api/crawl-runs/latest", { credentials: "same-origin" });
      // An authentication/authorization failure will not fix itself by retrying.
      if (response.status === 401 || response.status === 403) return "idle";
      if (!response.ok) return "failed";
      const data = await response.json();
      update(data.running || data.latest);
      return data.running ? "running" : "idle";
    } catch (_error) {
      return "failed";
    }
  };
  let failures = 0;
  // Chained timeouts instead of setInterval: the next request is scheduled
  // only after the previous one has finished, so slow responses never overlap.
  const tick = async () => {
    const outcome = await poll();
    if (outcome === "idle") return;
    failures = outcome === "failed" ? failures + 1 : 0;
    if (failures >= maxConsecutiveFailures) return;
    window.setTimeout(tick, pollIntervalMs);
  };
  window.setTimeout(tick, pollIntervalMs);
}
setupColumns();
setupProgress();
})();

View File

@@ -11,6 +11,7 @@
<h1 class="admin__brand">MIEM Employees</h1> <h1 class="admin__brand">MIEM Employees</h1>
<nav class="admin__nav"> <nav class="admin__nav">
<a class="admin__link" href="/admin">Dashboard</a> <a class="admin__link" href="/admin">Dashboard</a>
<a class="admin__link" href="/admin/directory">Directory</a>
<a class="admin__link" href="/admin/employees">Employees</a> <a class="admin__link" href="/admin/employees">Employees</a>
<a class="admin__link" href="/admin/runs">Runs</a> <a class="admin__link" href="/admin/runs">Runs</a>
<form method="post" action="/admin/logout"> <form method="post" action="/admin/logout">
@@ -24,5 +25,6 @@
<footer class="admin__footer"> <footer class="admin__footer">
Backend {{ backend_version }} · Frontend {{ frontend_version }} Backend {{ backend_version }} · Frontend {{ frontend_version }}
</footer> </footer>
{% block scripts %}{% endblock %}
</body> </body>
</html> </html>

View File

@@ -2,10 +2,48 @@
{% block title %}Dashboard · MIEM Employees{% endblock %} {% block title %}Dashboard · MIEM Employees{% endblock %}
{% block content %} {% block content %}
<section class="admin__grid"> <section class="admin__grid">
<div class="metric"><div class="metric__label">Total</div><div class="metric__value">{{ counts.total }}</div></div>
<div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div> <div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div>
<div class="metric"><div class="metric__label">New in last run</div><div class="metric__value">{{ counts.new_in_last_run }}</div></div>
<div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div> <div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div>
<div class="metric"><div class="metric__label">Runs</div><div class="metric__value">{{ counts.runs }}</div></div> </section>
<div class="metric"><div class="metric__label">Errors</div><div class="metric__value">{{ counts.errors }}</div></div> <section class="stats-strip">
<div class="stats-strip__item">
<span class="stats-strip__label">Latest added</span>
{% if counts.latest_added %}
<a class="stats-strip__value" href="/admin/employees/{{ counts.latest_added.id }}">{{ counts.latest_added.full_name or counts.latest_added.canonical_url }}</a>
{% else %}
<span class="stats-strip__value">No employees yet</span>
{% endif %}
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Runs</span>
<span class="stats-strip__value">{{ counts.runs }}</span>
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Errors</span>
<span class="stats-strip__value">{{ counts.errors }}</span>
</div>
</section>
<section class="panel progress-panel" data-progress-panel>
<div class="progress-panel__header">
<h2 class="panel__title">Parsing progress</h2>
<form method="post" action="/admin/crawl-now">
<button class="button" type="submit">Start crawl now</button>
</form>
</div>
{% set run = counts.current_running_run or latest_run %}
<div class="progress-panel__body" data-progress-body>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status if run else "idle" }}</span>
<span><span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span> processed</span>
<span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ run.progress_percent if run else 0 }}%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>{{ run.progress_percent if run else 0 }}</span>%</div>
</div>
</section> </section>
<section class="panel"> <section class="panel">
<h2 class="panel__title">Latest runs</h2> <h2 class="panel__title">Latest runs</h2>
@@ -19,3 +57,6 @@
</table> </table>
</section> </section>
{% endblock %} {% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -0,0 +1,112 @@
{% extends "base.html" %}
{% block title %}Directory · MIEM Employees{% endblock %}
{% block content %}
<section class="directory">
<div class="directory__header">
<div>
<h2 class="directory__title">Directory</h2>
<p class="directory__summary">{{ page.total }} employees found</p>
</div>
<button class="button" type="button" data-columns-open>Columns</button>
</div>
{# Filter/sort form. Submitting via GET drops any offset, so results restart at the first page. #}
<form class="directory__filters" method="get" action="/admin/directory">
  {# Keep a custom page size when filters are re-applied. #}
  {% if filters.limit %}<input type="hidden" name="limit" value="{{ filters.limit }}">{% endif %}
  <input class="directory__input" name="q" value="{{ filters.q }}" placeholder="Name or URL" aria-label="Search by name or URL">
  <select class="directory__input" name="status" aria-label="Status">
    <option value="" {% if not filters.status %}selected{% endif %}>All statuses</option>
    <option value="active" {% if filters.status == "active" %}selected{% endif %}>Active</option>
    <option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Dismissed</option>
  </select>
  <select class="directory__input" name="has_email" aria-label="Email presence">
    <option value="" {% if not filters.has_email %}selected{% endif %}>Any email</option>
    <option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Has email</option>
    <option value="false" {% if filters.has_email == "false" %}selected{% endif %}>No email</option>
  </select>
  <input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="First seen from">
  <input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="First seen to">
  <select class="directory__input" name="sort" aria-label="Sort by">
    {% for value, label in [("full_name", "Name"), ("status", "Status"), ("hse_start_year", "HSE start"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed")] %}
    <option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Sort: {{ label }}</option>
    {% endfor %}
  </select>
  <select class="directory__input" name="direction" aria-label="Sort direction">
    <option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>Ascending</option>
    <option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>Descending</option>
  </select>
  <button class="button" type="submit">Apply</button>
</form>
{# Employee table. Every header/body cell carries data-column so admin.js can toggle column visibility. #}
<div class="directory__table-wrap">
  <table class="directory-table" data-directory-table>
    <thead>
      <tr>
        <th class="directory-table__head" data-column="full_name">Name</th>
        <th class="directory-table__head" data-column="status">Status</th>
        <th class="directory-table__head" data-column="positions">Positions</th>
        <th class="directory-table__head" data-column="hse_start_year">HSE start</th>
        <th class="directory-table__head" data-column="email">Email</th>
        <th class="directory-table__head" data-column="phone">Phone</th>
        <th class="directory-table__head" data-column="address">Address</th>
        <th class="directory-table__head" data-column="publications_count">Publications</th>
        <th class="directory-table__head" data-column="courses_count">Courses</th>
        <th class="directory-table__head" data-column="first_seen_at">First seen</th>
        <th class="directory-table__head" data-column="last_seen_at">Last seen</th>
        <th class="directory-table__head" data-column="dismissed_at">Dismissed</th>
        <th class="directory-table__head" data-column="profile">Profile</th>
      </tr>
    </thead>
    <tbody>
      {# Subscript access on purpose: with a dict, `page.items` would resolve to the
         dict.items method (attribute lookup wins) instead of the "items" entry. #}
      {% for employee in page["items"] %}
      <tr class="directory-table__row" data-row-href="/admin/employees/{{ employee.id }}">
        <td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "No name" }}</td>
        <td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
        <td class="directory-table__cell" data-column="positions">{{ employee.positions_text }}</td>
        <td class="directory-table__cell" data-column="hse_start_year">{{ employee.hse_start_year or "" }}</td>
        <td class="directory-table__cell" data-column="email">{{ employee.email_text }}</td>
        <td class="directory-table__cell" data-column="phone">{{ employee.phone_text }}</td>
        <td class="directory-table__cell" data-column="address">{{ employee.address or "" }}</td>
        <td class="directory-table__cell" data-column="publications_count">{{ employee.publications_count }}</td>
        <td class="directory-table__cell" data-column="courses_count">{{ employee.courses_count }}</td>
        <td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_at or "" }}</td>
        <td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_at or "" }}</td>
        <td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_at or "" }}</td>
        <td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Open</a></td>
      </tr>
      {% else %}
      <tr><td class="directory-table__empty" colspan="13">No employees match these filters.</td></tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{# Offset-based pager. Links reuse the current URL and only replace `offset`. #}
<div class="directory__pagination">
  {# Clamp at 0: with 0 < offset < limit the plain difference would be negative. #}
  {% set prev_offset = [filters.offset - filters.limit, 0] | max %}
  {% set next_offset = filters.offset + filters.limit %}
  {% if filters.offset > 0 %}
  <a class="admin__link" href="{{ request.url.include_query_params(offset=prev_offset) }}">Previous</a>
  {% endif %}
  <span class="directory__page">Page {{ page.page }}{% if page.pages %} of {{ page.pages }}{% endif %}</span>
  {% if next_offset < page.total %}
  <a class="admin__link" href="{{ request.url.include_query_params(offset=next_offset) }}">Next</a>
  {% endif %}
</div>
</section>
{# Column picker. Hidden by default; admin.js toggles the `hidden` attribute and reads the checkbox values. #}
<div class="columns-modal" data-columns-modal hidden>
  <div class="columns-modal__backdrop" data-columns-close></div>
  {# role/aria-modal expose the panel to assistive technology as a modal dialog. #}
  <section class="columns-modal__panel" role="dialog" aria-modal="true" aria-label="Column settings">
    <div class="columns-modal__header">
      <h3 class="columns-modal__title">Visible columns</h3>
      <button class="button button--ghost" type="button" data-columns-close>Close</button>
    </div>
    <div class="columns-modal__grid">
      {% for key, label in [("full_name", "Name"), ("status", "Status"), ("positions", "Positions"), ("hse_start_year", "HSE start"), ("email", "Email"), ("phone", "Phone"), ("address", "Address"), ("publications_count", "Publications"), ("courses_count", "Courses"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed"), ("profile", "Profile")] %}
      <label class="columns-modal__option"><input class="columns-modal__checkbox" type="checkbox" value="{{ key }}" data-column-toggle> {{ label }}</label>
      {% endfor %}
    </div>
  </section>
</div>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -1,19 +1,192 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block title %}{{ employee.full_name }} · MIEM Employees{% endblock %} {% block title %}{{ employee_view.full_name }} · MIEM Employees{% endblock %}
{% block content %} {% block content %}
<section class="panel"> <section class="employee-card">
<h2 class="panel__title">{{ employee.full_name or employee.profile_key }}</h2> <div class="employee-card__header">
<p><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></p> <div class="employee-card__identity">
<p><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></p> <h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2>
<h3>Tabs</h3> <span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status }}</span>
<ul> </div>
{% for tab in employee.tabs %} <a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a>
<li><a class="admin__link" href="{{ tab.href }}">{{ tab.title }}</a></li> </div>
{% endfor %}
</ul> <section class="employee-card__section">
<h3>Current data</h3> <h3 class="employee-section__title">Основная информация</h3>
<pre class="code">{{ employee.current_data | tojson(indent=2) }}</pre> <dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Должности</dt>
<dd class="employee-card__meta-value">
{% if employee_view.positions %}
<ul class="employee-card__list">
{% for position in employee_view.positions %}
<li class="employee-card__list-item">{{ position }}</li>
{% endfor %}
</ul>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Год начала работы в ВШЭ</dt><dd class="employee-card__meta-value">{{ employee_view.hse_start_year or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Profile type</dt><dd class="employee-card__meta-value">{{ employee_view.profile_type or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Profile ID</dt><dd class="employee-card__meta-value">{{ employee_view.profile_id or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">First seen</dt><dd class="employee-card__meta-value">{{ employee_view.first_seen_at or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Last seen</dt><dd class="employee-card__meta-value">{{ employee_view.last_seen_at or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Dismissed at</dt><dd class="employee-card__meta-value">{{ employee_view.dismissed_at or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Parser version</dt><dd class="employee-card__meta-value">{{ employee_view.parser_version or "Не указано" }}</dd></div>
</dl>
</section>
<section class="employee-card__section">
<h3 class="employee-section__title">Контакты</h3>
<dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Email</dt>
<dd class="employee-card__meta-value">
{% if employee_view.contacts.emails %}
<ul class="employee-card__list">
{% for email in employee_view.contacts.emails %}
<li class="employee-card__list-item"><a class="admin__link" href="mailto:{{ email }}">{{ email }}</a></li>
{% endfor %}
</ul>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Телефоны</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.phones | join(", ") if employee_view.contacts.phones else "Не указано" }}</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Адрес</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.address or "Не указано" }}</dd>
</div>
{# Subscript access on purpose: if `contacts` is a dict, `contacts.items` resolves to the
   dict.items method (always truthy, not iterable) rather than the "items" entry. #}
{% set other_contacts = employee_view.contacts["items"] %}
{% if other_contacts %}
<div class="employee-card__meta-item employee-card__meta-item--wide">
  <dt class="employee-card__meta-label">Прочее</dt>
  <dd class="employee-card__meta-value">
    <ul class="employee-card__list">
      {% for item in other_contacts %}
      <li class="employee-card__list-item">{{ item }}</li>
      {% endfor %}
    </ul>
  </dd>
</div>
{% endif %}
</dl>
</section>
{% if employee_view.external_ids %}
<section class="employee-card__section">
<h3 class="employee-section__title">Внешние идентификаторы</h3>
<ul class="employee-card__list">
{% for external_id in employee_view.external_ids %}
<li class="employee-card__list-item">
<strong>{{ external_id.system }}:</strong>
{% if external_id.url %}
<a class="admin__link" href="{{ external_id.url }}">{{ external_id.value }}</a>
{% else %}
{{ external_id.value }}
{% endif %}
</li>
{% endfor %}
</ul>
</section>
{% endif %}
<section class="employee-card__section">
<h3 class="employee-section__title">Разделы профиля</h3>
{% if employee_view.sections %}
<div class="employee-card__sections">
{% for section in employee_view.sections %}
<article class="employee-section">
<div class="employee-section__header">
<h4 class="employee-section__title">{{ section.title }}</h4>
<span class="employee-section__type">{{ section.type }}</span>
</div>
{% if section.type == "year_blocks" and section.year_entries %}
<ul class="employee-card__list">
{% for entry in section.year_entries %}
<li class="employee-card__list-item">{% if entry.year %}<strong>{{ entry.year }}:</strong> {% endif %}{{ entry.text }}</li>
{% endfor %}
</ul>
{% elif section.type == "publications" and section.publications %}
{% if section.publications_count %}<p class="employee-section__note">Всего: {{ section.publications_count }}</p>{% endif %}
<ul class="employee-card__list">
{% for publication in section.publications %}
<li class="employee-card__list-item">
{% if publication.url %}
<a class="admin__link" href="{{ publication.url }}">{{ publication.title }}</a>
{% else %}
{{ publication.title }}
{% endif %}
{% if publication.text and publication.text != publication.title %}<div class="employee-section__text">{{ publication.text }}</div>{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "courses_by_year" and section.courses %}
{% if section.academic_year %}<p class="employee-section__note">Учебный год: {{ section.academic_year }}</p>{% endif %}
<ul class="employee-card__list">
{% for course in section.courses %}
<li class="employee-card__list-item">
{% if course.url %}
<a class="admin__link" href="{{ course.url }}">{{ course.title }}</a>
{% else %}
{{ course.title }}
{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "table" and section.table %}
<div class="employee-section__table-wrap">
<table class="employee-section__table">
{% if section.table.headers %}
<thead><tr>{% for header in section.table.headers %}<th class="employee-section__head">{{ header }}</th>{% endfor %}</tr></thead>
{% endif %}
<tbody>
{% for row in section.table.rows %}
<tr>
{% for cell in row.cells %}
<td class="employee-section__cell">{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
{% if section.paragraphs %}
{% for paragraph in section.paragraphs %}
<p class="employee-section__text">{{ paragraph }}</p>
{% endfor %}
{% endif %}
{# Subscript access on purpose: if `section` is a dict, `section.items` resolves to the
   dict.items method (always truthy, not iterable) rather than the "items" entry. #}
{% if section["items"] %}
<ul class="employee-card__list">
  {% for item in section["items"] %}
  <li class="employee-card__list-item">{{ item }}</li>
  {% endfor %}
</ul>
{% endif %}
{% endif %}
{% if section.links and section.type not in ["courses_by_year"] %}
<div class="employee-section__links">
{% for link in section.links %}
<a class="employee-section__link" href="{{ link.url }}">{{ link.text }}</a>
{% endfor %}
</div>
{% endif %}
</article>
{% endfor %}
</div>
{% else %}
<p class="employee-section__text">Разделы профиля не найдены.</p>
{% endif %}
</section>
</section> </section>
<section class="panel"> <section class="panel">
<h2 class="panel__title">Snapshots</h2> <h2 class="panel__title">Snapshots</h2>
<table class="table"> <table class="table">

View File

@@ -2,13 +2,43 @@
{% block title %}Runs · MIEM Employees{% endblock %} {% block title %}Runs · MIEM Employees{% endblock %}
{% block content %} {% block content %}
<section class="panel"> <section class="panel">
<h2 class="panel__title">Crawl runs</h2> <div class="progress-panel__header">
<form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl</button></form> <h2 class="panel__title">Crawl runs</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl now</button></form>
</div>
{# Progress summary for the first entry of `runs`; shows "idle" and zeros when the list is empty. #}
{% set run = runs[0] if runs else none %}
{% set processed = (run.parsed_count + run.error_count) if run else 0 %}
{% set found = run.found_count if run else 0 %}
{% set percent = ((processed / found) * 100) | round(1) if found else 0 %}
<div class="progress-panel" data-progress-panel>
  <div class="progress-panel__meta">
    <span data-progress-status>{{ run.status if run else "idle" }}</span>
    <span><span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ found }}</span> processed</span>
    <span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
  </div>
  <div class="progress-bar" aria-label="Parsing progress">
    <div class="progress-bar__fill" data-progress-fill style="width: {{ percent }}%"></div>
  </div>
  <div class="progress-panel__percent"><span data-progress-percent>{{ percent }}</span>%</div>
</div>
<table class="table"> <table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead> <thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">New</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead>
<tbody> <tbody>
{% for run in runs %} {% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr> <tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr>
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
@@ -25,3 +55,6 @@
</table> </table>
</section> </section>
{% endblock %} {% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.1.0" APP_VERSION = "0.2.2"
FRONTEND_VERSION = "0.1.0" FRONTEND_VERSION = "0.2.2"
BACKEND_VERSION = "0.1.0" BACKEND_VERSION = "0.2.2"

View File

@@ -7,8 +7,6 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-miem_password} POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-miem_password}
volumes: volumes:
- postgres_data:/var/lib/postgresql/data - postgres_data:/var/lib/postgresql/data
ports:
- "${POSTGRES_PORT:-5432}:5432"
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-miem} -d ${POSTGRES_DB:-miem_workers}"] test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-miem} -d ${POSTGRES_DB:-miem_workers}"]
interval: 10s interval: 10s
@@ -22,7 +20,7 @@ services:
environment: environment:
DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers} DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers}
ports: ports:
- "${API_PORT:-8000}:8000" - "127.0.0.1:8000:8000"
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
@@ -44,7 +42,7 @@ services:
environment: environment:
DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers} DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers}
ports: ports:
- "${MCP_PORT:-8001}:8000" - "127.0.0.1:8001:8000"
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy

View File

@@ -13,6 +13,7 @@ CREATE TABLE IF NOT EXISTS crawl_runs (
finished_at TIMESTAMPTZ, finished_at TIMESTAMPTZ,
found_count INTEGER NOT NULL DEFAULT 0, found_count INTEGER NOT NULL DEFAULT 0,
parsed_count INTEGER NOT NULL DEFAULT 0, parsed_count INTEGER NOT NULL DEFAULT 0,
new_count INTEGER NOT NULL DEFAULT 0,
error_count INTEGER NOT NULL DEFAULT 0, error_count INTEGER NOT NULL DEFAULT 0,
dismissed_count INTEGER NOT NULL DEFAULT 0, dismissed_count INTEGER NOT NULL DEFAULT 0,
message TEXT message TEXT

View File

@@ -0,0 +1,2 @@
-- Add the new_count counter to crawl_runs for databases created before the column existed.
-- IF NOT EXISTS makes the statement safe to run more than once.
ALTER TABLE crawl_runs
ADD COLUMN IF NOT EXISTS new_count INTEGER NOT NULL DEFAULT 0;

161
tests/test_admin_data.py Normal file
View File

@@ -0,0 +1,161 @@
from datetime import datetime, timezone
from app.models import CrawlRun, Employee
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
list_employees_page,
run_payload,
stats_payload,
)
def test_employee_display_payload_extracts_common_fields(db_session):
    """Flattened display fields are derived from the nested ``current_data``."""
    current_data = {
        "positions": ["Professor"],
        "hse_start_year": 2024,
        "contacts": {"emails": ["person@hse.ru"], "phones": ["+79990000000"], "address": "Moscow"},
        "sections": [
            {"type": "publications", "publications": [{"title": "Paper"}]},
            {"type": "courses_by_year", "courses": [{"title": "Course"}]},
        ],
    }
    employee = Employee(
        profile_key="staff:person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data=current_data,
    )

    result = employee_display_payload(employee)

    expected = {
        "positions_text": "Professor",
        "email_text": "person@hse.ru",
        "publications_count": 1,
        "courses_count": 1,
    }
    assert {key: result[key] for key in expected} == expected
def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
    """The detail payload exposes contacts, external ids and one entry per section type."""
    contacts = {
        "emails": ["person@hse.ru"],
        "phones": ["+79990000000"],
        "address": "Moscow",
        "items": [{"raw": "consultation hours"}],
    }
    education = {
        "title": "Education",
        "type": "year_blocks",
        "year_entries": [{"year": 2020, "text": "Master degree"}],
    }
    publications = {
        "title": "Publications",
        "type": "publications",
        "publications": [{"title": "Paper", "text": "Paper details", "url": "https://example.test/paper"}],
    }
    courses = {
        "title": "Courses",
        "type": "courses_by_year",
        "academic_year": "2025/2026",
        "courses": [{"title": "Course", "url": "https://example.test/course"}],
    }
    # A section of an unrecognised type that only carries raw text.
    fallback = {
        "title": "Fallback",
        "type": "generic",
        "raw_text": "Fallback text",
    }
    employee = Employee(
        profile_key="staff:person",
        profile_type="staff",
        profile_id="person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={
            "positions": ["Professor"],
            "hse_start_year": 2024,
            "contacts": contacts,
            "external_ids": [{"system": "ORCID", "value": "0000", "url": "https://orcid.org/0000"}],
            "sections": [education, publications, courses, fallback],
        },
    )

    result = employee_detail_payload(employee)

    result_contacts = result["contacts"]
    assert result_contacts["emails"] == ["person@hse.ru"]
    # Contact items given as {"raw": ...} come back as plain strings.
    assert result_contacts["items"] == ["consultation hours"]
    assert result["external_ids"][0]["system"] == "ORCID"

    sections = result["sections"]
    assert sections[0]["year_entries"][0]["text"] == "Master degree"
    assert sections[1]["publications"][0]["title"] == "Paper"
    assert sections[2]["courses"][0]["title"] == "Course"
    # The raw text of the generic section is surfaced as a single paragraph.
    assert sections[3]["paragraphs"] == ["Fallback text"]
def test_list_employees_page_filters_sorts_and_paginates(db_session):
    """Filtering by status returns only the matching employee.

    NOTE(review): only the status filter is observable here; with a single
    match, the sort/direction/limit arguments cannot affect the outcome.
    """
    dismissed = Employee(
        profile_key="staff:b",
        canonical_url="https://www.hse.ru/staff/b",
        full_name="Beta",
        status="dismissed",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": []}},
    )
    active = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": ["alpha@hse.ru"]}},
    )
    db_session.add_all([dismissed, active])
    db_session.commit()

    result = list_employees_page(db_session, status="active", sort="full_name", direction="asc", limit=10)

    assert result["total"] == 1
    first_item = result["items"][0]
    assert first_item["full_name"] == "Alpha"
def test_stats_payload_uses_latest_run_new_count(db_session):
    """Check that the stats payload reports totals and the run's ``new_count``.

    With a single active employee and a single completed crawl run whose
    ``new_count`` is 3, the payload must expose ``total == 1``,
    ``active == 1`` and ``new_in_last_run == 3``.
    """
    only_employee = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    completed_run = CrawlRun(
        source_url="https://miem.hse.ru/persons",
        status="completed",
        new_count=3,
    )
    db_session.add(only_employee)
    db_session.add(completed_run)
    db_session.commit()

    stats = stats_payload(db_session)

    assert stats["total"] == 1
    assert stats["active"] == 1
    assert stats["new_in_last_run"] == 3
def test_run_payload_calculates_progress():
    """Check the derived progress fields of a crawl-run payload.

    For a running crawl with 10 found, 4 parsed and 1 failed profile the
    payload must report ``processed_count == 5`` and
    ``progress_percent == 50.0``.
    """
    running_crawl = CrawlRun(
        source_url="https://miem.hse.ru/persons",
        status="running",
        found_count=10,
        parsed_count=4,
        error_count=1,
    )

    result = run_payload(running_crawl)

    assert result["processed_count"] == 5
    assert result["progress_percent"] == 50.0

View File

@@ -8,7 +8,8 @@ from sqlalchemy.pool import StaticPool
from app.config import Settings, get_settings from app.config import Settings, get_settings
from app.db import Base, get_db from app.db import Base, get_db
from app.main import app from app.main import app
from app.models import Employee from app.models import CrawlRun, Employee
from app.security import SESSION_COOKIE, sign_session
def test_health_returns_versions(): def test_health_returns_versions():
@@ -17,7 +18,7 @@ def test_health_returns_versions():
response = client.get("/api/health") response = client.get("/api/health")
assert response.status_code == 200 assert response.status_code == 200
assert response.json()["backend_version"] == "0.1.0" assert response.json()["backend_version"] == "0.2.2"
def test_mcp_requires_token_and_lists_tools(): def test_mcp_requires_token_and_lists_tools():
@@ -105,3 +106,54 @@ def test_mcp_search_employees_returns_matching_employee():
assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"] assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"]
app.dependency_overrides.clear() app.dependency_overrides.clear()
def test_api_employees_and_stats_require_admin_session():
    """Check that ``/api/employees`` and ``/api/stats`` answer a signed-in admin.

    An in-memory SQLite database is seeded with one active employee and one
    completed crawl run (``new_count=1``). The app's ``get_db`` and
    ``get_settings`` dependencies are overridden to use that database and
    fixed admin credentials, and the client carries a signed session cookie.

    NOTE(review): despite the test name, only the authenticated path is
    asserted here; the rejection of requests without a session is not
    exercised by this test.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        # StaticPool is used so every session is served from the same pool
        # as the one that created the tables in this in-memory database.
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)

    db = Session()
    try:
        db.add(
            Employee(
                profile_key="staff:alpha",
                profile_type="staff",
                profile_id="alpha",
                canonical_url="https://www.hse.ru/staff/alpha",
                full_name="Alpha Person",
                status="active",
                first_seen_at=datetime.now(timezone.utc),
                last_seen_at=datetime.now(timezone.utc),
                current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []},
            )
        )
        db.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1))
        db.commit()
    finally:
        # Close the seeding session even when the insert fails.
        db.close()

    settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")

    def override_db():
        """Yield a fresh session bound to the in-memory engine, then close it."""
        session = Session()
        try:
            yield session
        finally:
            session.close()

    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: settings
    try:
        client = TestClient(app)
        client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))

        employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True})
        stats = client.get("/api/stats")

        assert employees.status_code == 200
        assert employees.json()["total"] == 1
        assert stats.status_code == 200
        assert stats.json()["new_in_last_run"] == 1
    finally:
        # Always drop the overrides: a failing assertion must not leak the
        # test database or test settings into tests that run afterwards.
        app.dependency_overrides.clear()

View File

@@ -1,7 +1,7 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from app.models import Employee from app.models import CrawlRun, Employee
from app.services.crawler import _mark_dismissed from app.services.crawler import _mark_dismissed, _upsert_employee
def test_mark_dismissed_only_marks_missing_active_employees(db_session): def test_mark_dismissed_only_marks_missing_active_employees(db_session):
@@ -32,3 +32,27 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
gone = db_session.query(Employee).filter_by(profile_key="staff:gone").one() gone = db_session.query(Employee).filter_by(profile_key="staff:gone").one()
assert gone.status == "dismissed" assert gone.status == "dismissed"
assert gone.dismissed_at is not None assert gone.dismissed_at is not None
def test_upsert_employee_increments_new_count_for_new_employee(db_session):
    """Check that upserting a previously unknown profile bumps ``new_count``.

    A running crawl run is persisted, one parsed profile for a new person
    is passed to ``_upsert_employee``, and after committing the run's
    ``new_count`` must equal 1.
    """
    crawl = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl)
    db_session.commit()

    parsed_profile = {
        "source_url": "https://www.hse.ru/staff/newperson",
        "profile_type": "staff",
        "profile_id": "newperson",
        "full_name": "New Person",
        "tabs": [],
        "sections": [],
        "parser_version": "0.2.0",
        "_html": "<html></html>",
    }
    _upsert_employee(db_session, crawl, parsed_profile)
    db_session.commit()

    assert crawl.new_count == 1

View File

@@ -0,0 +1,13 @@
from pathlib import Path
def test_employee_detail_template_is_human_readable():
    """Check the employee detail template for expected and forbidden markers.

    The template text must not contain the raw-dump markers listed in
    ``forbidden_fragments`` and must contain every heading listed in
    ``required_fragments``. The path is relative to the current working
    directory.
    """
    markup = Path("app/templates/employee_detail.html").read_text(encoding="utf-8")

    forbidden_fragments = (
        "Current data",
        "<pre class=\"code\"",
        ">Tabs<",
    )
    required_fragments = (
        "Основная информация",
        "Контакты",
        "Разделы профиля",
        "Snapshots",
    )

    for fragment in forbidden_fragments:
        assert fragment not in markup
    for fragment in required_fragments:
        assert fragment in markup