Compare commits

..

43 Commits

Author SHA1 Message Date
5b9d71426d Merge pull request 'fix: support grouped HSE publication API responses' (#21) from fix/grouped-publications-parser into main
Reviewed-on: #21
2026-05-13 09:46:48 +00:00
Anton
efa7192e45 fix: support grouped HSE publication API responses 2026-05-13 12:46:07 +03:00
b27d613143 Merge pull request 'fix: remove mcp-auth from yml-file' (#20) from fix/remove-mcp-auth-compose into main
Reviewed-on: #20
2026-05-08 09:33:17 +00:00
Anton
a1ab1c0319 fix: remove mcp-auth from yml-file 2026-05-08 12:32:40 +03:00
0b4e04544d Merge pull request 'fix: remove MCP application-level authorization' (#19) from fix/remove-mcp-auth into main
Reviewed-on: #19
2026-05-08 09:15:18 +00:00
Anton
7593a460c7 fix: remove MCP application-level authorization 2026-05-08 12:14:19 +03:00
a4e7388bcf Merge pull request 'fix: use direct onclick handlers for run rows' (#18) from fix/direct-run-row-click-handler into main
Reviewed-on: #18
2026-05-07 15:25:26 +00:00
Anton
ac319b3ee5 fix: use direct onclick handlers for run rows 2026-05-07 18:23:14 +03:00
8e004c46ef Merge pull request 'fix: move run navigation from id link to table row' (#17) from fix/run-row-link-target into main
Reviewed-on: #17
2026-05-07 14:04:07 +00:00
Anton
7fa28e8e47 fix: move run navigation from id link to table row 2026-05-07 17:03:36 +03:00
1c4ad0bd9d Merge pull request 'fix: make run rows clickable and limit dashboard runs' (#16) from fix/dashboard-run-row-clicks into main
Reviewed-on: #16
2026-05-07 13:24:25 +00:00
Anton
52c5cc1af1 fix: make run rows clickable and limit dashboard runs 2026-05-07 16:23:39 +03:00
c97ced52b4 Merge pull request 'feat: make dashboard metrics and run rows clickable' (#15) from feature/dashboard-clickable-metrics into main
Reviewed-on: #15
2026-05-07 06:36:27 +00:00
Anton
deaecd8d3b feat: make dashboard metrics and run rows clickable 2026-05-07 09:35:44 +03:00
e4d4271e32 Merge pull request 'feat: track crawl run employee changes and verify dismissals' (#14) from feature/crawl-run-change-details into main
Reviewed-on: #14
2026-05-06 12:14:51 +00:00
Anton
d0459a2c30 feat: track crawl run employee changes and verify dismissals 2026-05-06 15:13:15 +03:00
Anton
2331c7a28d chore: removes sensitive data from docker file 2026-04-29 16:16:06 +03:00
064c34ea32 Merge pull request 'feat: adds oauth server to docker' (#13) from feature/add-oauth-server into main
Reviewed-on: #13
2026-04-29 12:59:55 +00:00
Anton
6a98ae4246 feat: adds oauth server to docker 2026-04-29 15:59:18 +03:00
a6f2883091 Merge pull request 'feat: requires OAuth-only auth mode for MCP agents' (#12) from feature/mcp-oauth-oidc into main
Reviewed-on: #12
2026-04-29 12:22:25 +00:00
Anton
d20b4f396b feat: requires OAuth-only auth mode for MCP agents 2026-04-29 15:08:18 +03:00
c7027bb503 Merge pull request 'feat: adds OAuth/OIDC authentication for MCP' (#11) from feature/mcp-oauth-oidc into main
Reviewed-on: #11
2026-04-29 11:35:00 +00:00
Anton
ad0b15cc6e feat: adds OAuth/OIDC authentication for MCP 2026-04-29 14:33:29 +03:00
af864ecb44 Merge pull request 'fix: enrich HSE profile parsing with publications and theses' (#10) from fix/hse-profile-parser-publications-vkr-pagination into main
Reviewed-on: #10
2026-04-29 11:16:17 +00:00
Anton
cc9481fc6c fix: enrich HSE profile parsing with publications and theses 2026-04-29 14:15:29 +03:00
cf578ce699 Merge pull request 'fix: allow empty CRAWL_LIMIT env value' (#9) from fix/empty-crawl-limit-env into main
Reviewed-on: #9
2026-04-29 09:50:34 +00:00
Anton
765efa1a1c fix: allow empty CRAWL_LIMIT env value 2026-04-29 12:49:58 +03:00
86330885e3 Merge pull request 'fix: localize admin UI and simplify employees navigation' (#8) from fix/admin-russian-ux-cleanup into main
Reviewed-on: #8
2026-04-29 09:39:42 +00:00
Anton
866e2b44d5 fix: localize admin UI and simplify employees navigation 2026-04-29 12:39:16 +03:00
f411de740e Merge pull request 'fix: avoid Jinja dict method collisions in admin templates' (#7) from fix/jinja-dict-method-collisions into main
Reviewed-on: #7
2026-04-29 09:12:13 +00:00
Anton
cdfbb26875 fix: avoid Jinja dict method collisions in admin templates 2026-04-29 12:11:16 +03:00
5eaad38076 Merge pull request 'fix: avoid Jinja dict items collision in employee card' (#6) from fix/employee-card-contact-items into main
Reviewed-on: #6
2026-04-29 08:35:13 +00:00
Anton
af87fa8af3 fix: avoid Jinja dict items collision in employee card 2026-04-29 11:34:46 +03:00
26db5832fd Merge pull request 'fix: harden admin employee views against malformed data' (#5) from fix/admin-employee-view-malformed-data into main
Reviewed-on: #5
2026-04-29 08:12:29 +00:00
Anton
7530cbdb60 fix: harden admin employee views against malformed data 2026-04-29 11:11:57 +03:00
ce90414654 Merge pull request 'fix: make employee detail page human-readable' (#4) from feature/human-readable-employee-card into main
Reviewed-on: #4
2026-04-29 07:38:21 +00:00
Anton
755135d6ba fix: make employee detail page human-readable 2026-04-29 10:37:38 +03:00
69ad41da66 Merge pull request 'fix: fixes TemplateResponse' (#3) from fix/template-response into main
Reviewed-on: #3
2026-04-28 15:27:30 +00:00
Anton
2cd7585107 fix: fixes TemplateResponse 2026-04-28 18:25:29 +03:00
Anton
b1edec4d9e chore: fix docker-files 2026-04-28 18:12:07 +03:00
7c83482ed7 Merge pull request 'feature: improve admin directory and crawl progress' (#2) from feature/admin-directory-progress into main
Reviewed-on: #2
2026-04-28 14:24:55 +00:00
Anton
4bd5f26469 feature: improve admin directory and crawl progress 2026-04-28 17:24:10 +03:00
51d83d7062 Merge pull request 'feature: add MIEM employees parser service with admin UI and MCP' (#1) from feature/miem-employees-server into main
Reviewed-on: #1
2026-04-28 13:21:21 +00:00
36 changed files with 2969 additions and 183 deletions

View File

@@ -14,7 +14,5 @@ PARSER_USE_PLAYWRIGHT=false
ADMIN_USERNAME=admin
ADMIN_PASSWORD=change-me
SESSION_SECRET=change-me-session-secret
MCP_TOKEN=change-me-mcp-token
API_PORT=8000
MCP_PORT=8001

1
.gitignore vendored
View File

@@ -4,6 +4,7 @@ __pycache__/
*.py[cod]
*.db
.pytest_cache/
pytest-cache-files-*/
.coverage
htmlcov/
postgres_data/

View File

@@ -6,7 +6,7 @@
- `api`: FastAPI, REST API, HTML-админка, healthcheck.
- `worker`: weekly scheduler, который запускает парсинг по `CRAWL_CRON`.
- `mcp`: HTTP MCP endpoint с bearer token.
- `mcp`: открытый HTTP MCP endpoint для ИИ-агентов.
- `postgres`: основная БД.
Парсер использует фиксированный источник сотрудников, по умолчанию `https://miem.hse.ru/persons`. Для каждой карточки сохраняются ФИО, должности, год начала работы, контакты, идентификаторы, вкладки профиля, секции, публикации, курсы, ВКР, JSON-снапшот и сжатый HTML-снапшот. Ссылки обходятся только из меню профиля самого сотрудника (`person-menu`), например `#sci`, `#teaching`, `#main`.
@@ -27,7 +27,6 @@ cp .env.example .env
- `CRAWL_LIMIT`: опциональный лимит профилей для тестового запуска.
- `ADMIN_USERNAME`, `ADMIN_PASSWORD`: логин и пароль админки.
- `SESSION_SECRET`: секрет подписи cookie.
- `MCP_TOKEN`: bearer token для `/mcp`.
- `PARSER_USE_PLAYWRIGHT`: включение Playwright-рендера динамических вкладок.
## Локальный запуск
@@ -41,6 +40,13 @@ uvicorn app.main:app --reload
Админка: `http://localhost:8000/admin`.
В админке доступны:
- `Dashboard`: общая статистика, последний добавленный сотрудник, прогресс текущего/последнего парсинга и ручной запуск.
- `Directory`: настраиваемая таблица сотрудников с фильтрами, сортировкой, пагинацией и выбором колонок.
- `Employees`: простая legacy-таблица сотрудников.
- `Runs`: история запусков, ошибки и progress bar.
## Docker Compose
```bash
@@ -57,7 +63,7 @@ docker compose up --build
## Парсинг
Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на странице `Runs` или через REST:
Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на `Dashboard` и странице `Runs` или через REST:
```bash
curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..."
@@ -67,12 +73,15 @@ curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=.
- найденные сотрудники получают статус `active` и обновленный `last_seen_at`;
- новые сотрудники добавляются в `employees`;
- количество новых сотрудников за запуск сохраняется в `crawl_runs.new_count`;
- активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`;
- каждый успешный разбор сохраняет запись в `employee_snapshots`.
Во время выполнения парсинга `found_count`, `parsed_count` и `error_count` обновляются в базе. Админка опрашивает `/api/crawl-runs/latest` и показывает прогресс как `(parsed_count + error_count) / found_count`.
## MCP
Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`.
Endpoint: `POST /mcp`, без авторизации на уровне приложения.
Поддерживаемые tools:
@@ -82,15 +91,16 @@ Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>
- `list_employee_courses(profile_id_or_url)`
- `get_crawl_status()`
Пример:
Пример запроса (авторизация на уровне приложения не требуется):
```bash
curl http://localhost:8001/mcp \
-H "Authorization: Bearer change-me-mcp-token" \
-H "Content-Type: application/json" \
-d '{"jsonrpc":"2.0","id":1,"method":"tools/list","params":{}}'
```
Если MCP нужно ограничить, делайте это на сетевом уровне: localhost binding, VPN, firewall, reverse proxy или другой внешний контур доступа.
## Обслуживание
```bash
@@ -100,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
docker compose down
```
Версия сервиса: `0.1.0`. Админка всегда показывает версии backend и frontend в footer.
Версия сервиса: `0.4.5`. Админка всегда показывает версии backend и frontend в footer.

View File

@@ -1,14 +1,22 @@
from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request, Response
from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy import desc, func, or_, select
from sqlalchemy import desc, func, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
from app.services.crawler import run_crawl
from app.services.admin_data import (
employee_detail_payload,
format_admin_datetime,
list_employees_page,
run_detail_payload,
run_payload,
stats_payload,
)
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/admin")
@@ -18,14 +26,12 @@ templates = Jinja2Templates(directory="app/templates")
@router.get("", response_class=HTMLResponse)
def dashboard(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings)
counts = {
"active": db.scalar(select(func.count()).select_from(Employee).where(Employee.status == "active")) or 0,
"dismissed": db.scalar(select(func.count()).select_from(Employee).where(Employee.status == "dismissed")) or 0,
"runs": db.scalar(select(func.count()).select_from(CrawlRun)) or 0,
"errors": db.scalar(select(func.count()).select_from(CrawlError)) or 0,
}
runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
return _render(request, "dashboard.html", {"counts": counts, "runs": runs})
counts = stats_payload(db)
counts["runs"] = db.scalar(select(func.count()).select_from(CrawlRun)) or 0
counts["errors"] = db.scalar(select(func.count()).select_from(CrawlError)) or 0
run_models = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(5)).all()
runs = [run_payload(run) for run in run_models]
return _render(request, "dashboard.html", {"counts": counts, "runs": runs, "latest_run": runs[0] if runs else None})
@router.get("/login", response_class=HTMLResponse)
@@ -35,7 +41,6 @@ def login_form(request: Request):
@router.post("/login")
def login(
response: Response,
request: Request,
username: str = Form(...),
password: str = Form(...),
@@ -60,18 +65,61 @@ def employees(
request: Request,
status: str | None = None,
q: str | None = None,
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
return RedirectResponse("/admin/directory", status_code=303)
@router.get("/directory", response_class=HTMLResponse)
def directory(
request: Request,
status: str | None = None,
q: str | None = None,
started_from: str | None = None,
started_to: str | None = None,
has_email: str | None = None,
sort: str = "full_name",
direction: str = "asc",
limit: int = 50,
offset: int = 0,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
stmt = select(Employee)
if status:
stmt = stmt.where(Employee.status == status)
if q:
pattern = f"%{q}%"
stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern)))
items = db.scalars(stmt.order_by(Employee.full_name).limit(200)).all()
return _render(request, "employees.html", {"employees": items, "status": status or "", "q": q or ""})
parsed_started_from = _parse_date(started_from)
parsed_started_to = _parse_date(started_to)
parsed_has_email = None if has_email in (None, "") else has_email == "true"
page = list_employees_page(
db,
status=status,
q=q,
started_from=parsed_started_from,
started_to=parsed_started_to,
has_email=parsed_has_email,
sort=sort,
direction=direction,
limit=limit,
offset=offset,
)
return _render(
request,
"directory.html",
{
"page": page,
"filters": {
"status": status or "",
"q": q or "",
"started_from": started_from or "",
"started_to": started_to or "",
"has_email": has_email or "",
"sort": sort,
"direction": direction,
"limit": page["limit"],
"offset": offset,
},
},
)
@router.get("/employees/{employee_id}", response_class=HTMLResponse)
@@ -85,34 +133,82 @@ def employee_detail(
employee = db.get(Employee, employee_id)
if not employee:
return RedirectResponse("/admin/employees", status_code=303)
snapshots = sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20]
return _render(request, "employee_detail.html", {"employee": employee, "snapshots": snapshots})
snapshots = [
{
"captured_display": format_admin_datetime(snapshot.captured_at),
"checksum": snapshot.checksum,
"parser_version": snapshot.parser_version,
}
for snapshot in sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20]
]
return _render(
request,
"employee_detail.html",
{"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots},
)
@router.get("/runs", response_class=HTMLResponse)
def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings)
items = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)).all()
run_models = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)).all()
items = [run_payload(run) for run in run_models]
errors = db.scalars(select(CrawlError).order_by(desc(CrawlError.created_at)).limit(50)).all()
return _render(request, "runs.html", {"runs": items, "errors": errors})
@router.get("/runs/{run_id}", response_class=HTMLResponse)
def run_detail(
    run_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Render the detail page for one crawl run.

    Runs the admin check first, then looks the run up by primary key.
    An unknown ``run_id`` redirects (303) to the runs list instead of
    rendering an error page.
    """
    require_admin(request, settings)
    crawl_run = db.get(CrawlRun, run_id)
    if crawl_run:
        context = {"run": run_detail_payload(db, crawl_run)}
        return _render(request, "run_detail.html", context)
    return RedirectResponse("/admin/runs", status_code=303)
@router.post("/runs")
def trigger_run(
request: Request,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
if get_running_run(db):
return RedirectResponse("/admin/runs", status_code=303)
def _crawl() -> None:
with SessionLocal() as db:
run_crawl(db, settings)
run_crawl_if_idle(db, settings)
background_tasks.add_task(_crawl)
return RedirectResponse("/admin/runs", status_code=303)
@router.post("/crawl-now")
def crawl_now(
    request: Request,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Schedule a crawl from the dashboard and redirect back to it.

    Nothing is scheduled when ``get_running_run`` already reports a run;
    in both cases the response is a 303 redirect to ``/admin``.
    """
    require_admin(request, settings)

    def _crawl_in_background() -> None:
        # The task opens its own session through SessionLocal instead of
        # using the ``db`` handed to this request handler.
        with SessionLocal() as task_session:
            run_crawl_if_idle(task_session, settings)

    if not get_running_run(db):
        background_tasks.add_task(_crawl_in_background)
    return RedirectResponse("/admin", status_code=303)
def _render(request: Request, template: str, context: dict, status_code: int = 200) -> HTMLResponse:
payload = {
"request": request,
@@ -120,4 +216,15 @@ def _render(request: Request, template: str, context: dict, status_code: int = 2
"frontend_version": FRONTEND_VERSION,
**context,
}
return templates.TemplateResponse(template, payload, status_code=status_code)
return templates.TemplateResponse(request, template, payload, status_code=status_code)
def _parse_date(value: str | None):
if not value:
return None
try:
from datetime import date
return date.fromisoformat(value)
except ValueError:
return None

View File

@@ -1,12 +1,15 @@
from datetime import date
from fastapi import APIRouter, BackgroundTasks, Depends, Request
from sqlalchemy import desc, or_, select
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlRun, Employee
from app.security import require_admin
from app.services.crawler import run_crawl
from app.services.admin_data import employee_display_payload, list_employees_page, run_detail_payload, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/api")
@@ -22,20 +25,29 @@ def list_employees(
request: Request,
status: str | None = None,
q: str | None = None,
started_from: date | None = None,
started_to: date | None = None,
has_email: bool | None = None,
sort: str = "full_name",
direction: str = "asc",
limit: int = 50,
offset: int = 0,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
) -> dict:
require_admin(request, settings)
stmt = select(Employee)
if status:
stmt = stmt.where(Employee.status == status)
if q:
pattern = f"%{q}%"
stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern)))
employees = db.scalars(stmt.order_by(Employee.full_name).limit(limit).offset(offset)).all()
return {"items": [_employee_summary(item) for item in employees], "limit": limit, "offset": offset}
return list_employees_page(
db,
status=status,
q=q,
started_from=started_from,
started_to=started_to,
has_email=has_email,
sort=sort,
direction=direction,
limit=limit,
offset=offset,
)
@router.get("/employees/{employee_id}")
@@ -61,34 +73,67 @@ def list_crawl_runs(
) -> dict:
require_admin(request, settings)
runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(limit)).all()
return {"items": [_run_summary(run) for run in runs]}
return {"items": [run_payload(run) for run in runs]}
@router.get("/crawl-runs/latest")
def latest_crawl_run(
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the currently running crawl run and the most recently started one.

    Both entries are passed through ``run_payload``; the response always
    has the keys ``running`` and ``latest``.
    """
    require_admin(request, settings)
    newest_first = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1)
    running_run = get_running_run(db)
    latest_run = db.scalar(newest_first)
    return {
        "running": run_payload(running_run),
        "latest": run_payload(latest_run),
    }
@router.get("/crawl-runs/{run_id}")
def get_crawl_run(
    run_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the detailed payload for one crawl run.

    When the run does not exist, or ``run_detail_payload`` yields a falsy
    value, the body is ``{"error": "not_found"}``.
    """
    require_admin(request, settings)
    crawl_run = db.get(CrawlRun, run_id)
    if crawl_run:
        return run_detail_payload(db, crawl_run) or {"error": "not_found"}
    return {"error": "not_found"}
@router.post("/crawl-runs")
def trigger_crawl(
request: Request,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
) -> dict:
require_admin(request, settings)
running = get_running_run(db)
if running:
return {"status": "already_running", "run": run_payload(running)}
def _crawl() -> None:
with SessionLocal() as db:
run_crawl(db, settings)
run_crawl_if_idle(db, settings)
background_tasks.add_task(_crawl)
return {"status": "scheduled"}
@router.get("/stats")
def stats(
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the statistics built by ``stats_payload`` once the admin check has run."""
    require_admin(request, settings)
    payload = stats_payload(db)
    return payload
def _employee_summary(employee: Employee) -> dict:
return {
"id": employee.id,
"full_name": employee.full_name,
"status": employee.status,
"canonical_url": employee.canonical_url,
"last_seen_at": employee.last_seen_at.isoformat() if employee.last_seen_at else None,
"dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None,
}
return employee_display_payload(employee)
def _employee_detail(employee: Employee) -> dict:
@@ -99,15 +144,4 @@ def _employee_detail(employee: Employee) -> dict:
def _run_summary(run: CrawlRun) -> dict:
return {
"id": run.id,
"source_url": run.source_url,
"status": run.status,
"started_at": run.started_at.isoformat() if run.started_at else None,
"finished_at": run.finished_at.isoformat() if run.finished_at else None,
"found_count": run.found_count,
"parsed_count": run.parsed_count,
"error_count": run.error_count,
"dismissed_count": run.dismissed_count,
"message": run.message,
}
return run_payload(run) or {}

View File

@@ -1,5 +1,5 @@
from functools import lru_cache
from pydantic import Field
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -17,8 +17,13 @@ class Settings(BaseSettings):
admin_username: str = "admin"
admin_password: str = "admin"
session_secret: str = Field(default="dev-session-secret", min_length=8)
mcp_token: str = "dev-mcp-token"
@field_validator("crawl_limit", mode="before")
@classmethod
def empty_crawl_limit_as_none(cls, value):
    """Treat a blank ``CRAWL_LIMIT`` as "no limit".

    Runs before type validation (``mode="before"``), so it sees the raw
    value. An empty or whitespace-only string becomes ``None``; any other
    value is returned unchanged for the regular field validation.
    """
    # A value such as " " would otherwise reach validation as a non-empty
    # string, so whitespace-only input is handled like the empty string.
    if isinstance(value, str) and not value.strip():
        return None
    return value
@lru_cache
def get_settings() -> Settings:

View File

@@ -4,10 +4,10 @@ from fastapi import APIRouter, Depends, Request
from sqlalchemy import desc, or_, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import get_db
from app.models import CrawlRun, Employee
from app.security import require_mcp_token
from app.services.admin_data import run_detail_payload
from app.version import BACKEND_VERSION
router = APIRouter(prefix="/mcp")
@@ -46,6 +46,15 @@ TOOLS = [
"description": "Return the latest crawl run status.",
"inputSchema": {"type": "object", "properties": {}},
},
{
"name": "get_crawl_run_details",
"description": "Return detailed employee changes and errors for one crawl run.",
"inputSchema": {
"type": "object",
"properties": {"run_id": {"type": "integer"}},
"required": ["run_id"],
},
},
]
@@ -53,9 +62,7 @@ TOOLS = [
async def mcp_http(
request: Request,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
) -> dict:
require_mcp_token(request, settings)
payload = await request.json()
method = payload.get("method")
request_id = payload.get("id")
@@ -65,7 +72,7 @@ async def mcp_http(
if method == "initialize":
result = {
"protocolVersion": "2024-11-05",
"serverInfo": {"name": "miem-employees", "version": "0.1.0"},
"serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
"capabilities": {"tools": {}},
}
elif method == "tools/list":
@@ -94,6 +101,9 @@ def _call_tool(db: Session, name: str, arguments: dict) -> dict:
if name == "get_crawl_status":
run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
return _tool_response(_run_payload(run) if run else {"status": "never_run"})
if name == "get_crawl_run_details":
run = db.get(CrawlRun, int(arguments["run_id"]))
return _tool_response(run_detail_payload(db, run) if run else {"error": "not_found"})
raise ValueError(f"Unknown tool: {name}")

View File

@@ -41,6 +41,7 @@ class Employee(Base):
snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee")
class EmployeeSnapshot(Base):
@@ -69,10 +70,36 @@ class CrawlRun(Base):
finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
message: Mapped[str | None] = mapped_column(Text)
employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
class CrawlRunEmployeeChange(Base):
    """Row in ``crawl_run_employee_changes``: one employee-related change tied to a crawl run.

    Each row references its crawl run (required) and, optionally, an
    employee row, and carries the profile key/URL and the kind of change.
    """

    __tablename__ = "crawl_run_employee_changes"
    # Separate indexes on the run, the employee and the change type.
    __table_args__ = (
        Index("ix_crawl_run_employee_changes_run_id", "crawl_run_id"),
        Index("ix_crawl_run_employee_changes_employee_id", "employee_id"),
        Index("ix_crawl_run_employee_changes_change_type", "change_type"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning crawl run; always set.
    crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
    # Optional link to the employee row (annotated as ``int | None``).
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    # Short change label (max 32 chars); the allowed values are not visible here.
    change_type: Mapped[str] = mapped_column(String(32), nullable=False)
    # NOTE(review): presumably records whether the profile page was still
    # reachable when the change was verified — confirm against the crawler.
    profile_available: Mapped[bool | None] = mapped_column()
    message: Mapped[str | None] = mapped_column(Text)
    # Defaults to ``utcnow`` at insert time.
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    # Counterparts of ``CrawlRun.employee_changes`` and ``Employee.crawl_run_changes``.
    crawl_run: Mapped[CrawlRun] = relationship(back_populates="employee_changes")
    employee: Mapped[Employee | None] = relationship(back_populates="crawl_run_changes")
class CrawlError(Base):
__tablename__ = "crawl_errors"

View File

@@ -164,6 +164,7 @@ def parse_person_profile(
header = extract_person_header(soup, normalized_url)
tabs = extract_person_tabs(soup, normalized_url)
sections = extract_sections(soup, normalized_url)
sections = enrich_sections_from_hse_widgets(session, soup, normalized_url, headers, timeout, sections)
internal_links = [tab["href"] for tab in tabs if tab.get("href")]
return {
@@ -183,6 +184,25 @@ def parse_person_profile(
}
def enrich_sections_from_hse_widgets(
    session: Session,
    soup: BeautifulSoup,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
    sections: list[dict],
) -> list[dict]:
    """Merge widget-loaded publications and graduation theses into ``sections``.

    Works on a shallow copy of ``sections``. Each widget loader is called
    once; its result is upserted only when it returned something.
    """
    result = [*sections]
    if publications := _load_widget_publications(session, soup, headers, timeout):
        result = _upsert_publications_section(result, publications)
    if theses := _load_widget_graduation_theses(session, soup, source_url, headers, timeout):
        result = _upsert_graduation_theses_section(result, theses)
    return result
def _render_with_playwright(source_url: str, fallback_html: str) -> str:
try:
from playwright.sync_api import sync_playwright
@@ -206,6 +226,117 @@ def _render_with_playwright(source_url: str, fallback_html: str) -> str:
return fallback_html
def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: dict[str, str], timeout: int) -> list[dict]:
    """Load an author's publications through the publications search API.

    The author id comes from the ``data-author`` attribute of the
    ``AuthorSearch`` widget ``<script>`` tag in ``soup``. Pages of 100
    entries are requested from ``https://publications.hse.ru/api/searchPubs``
    until the API reports no more results or 20 pages have been read.

    Args:
        session: HTTP session used for the POST requests.
        soup: Parsed profile page that may contain the widget script tag.
        headers: Headers sent with every request.
        timeout: Per-request timeout passed to ``session.post``.

    Returns:
        Normalized, de-duplicated publication dicts. The list is empty when
        the widget or the author id is missing. A failing request ends the
        paging and whatever was collected so far is returned.
    """
    script = soup.select_one('script[data-widget-name="AuthorSearch"][data-author]')
    if not script:
        return []
    author_id = normalize_ws(script.get("data-author"))
    if not author_id:
        return []

    publications: list[dict] = []
    per_page = 100
    max_pages = 20  # hard stop so a misbehaving API cannot page forever
    for page_id in range(1, max_pages + 1):
        payload = {
            "type": "ANY",
            "filterParams": (
                f'"acceptLanguage":"ru"|"fullTextPublicEnabled": 1|'
                f'"pubsAuthor": {author_id}|"widgetName": "AuthorSearch"'
            ),
            "paginationParams": {
                "publsSort": ["TITLE_ASC"],
                "publsCount": per_page,
                "pageId": page_id,
            },
        }
        try:
            response = session.post(
                "https://publications.hse.ru/api/searchPubs",
                json=payload,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
        except Exception:
            # Best effort: keep the pages already collected, and leave through
            # the common exit so partial results are de-duplicated as well.
            break
        result = data.get("result") if isinstance(data, dict) else {}
        items = _extract_publication_items(result)
        if not items:
            break
        publications.extend(_normalize_publication_item(item) for item in items)
        # ``items`` is only non-empty when ``result`` is a dict, so ``.get`` is safe.
        try:
            total = int(result.get("total") or 0)
        except (TypeError, ValueError):
            # A malformed total must not abort the whole profile parse.
            total = 0
        if not result.get("more") and len(publications) >= total:
            break
    return _dedupe_publications(publications)
def _extract_publication_items(result: object) -> list[dict]:
    """Return the publication dicts found under ``result["items"]``; ``[]`` for a non-dict ``result``."""
    return _flatten_publication_items(result.get("items")) if isinstance(result, dict) else []
def _flatten_publication_items(value: object) -> list[dict]:
if isinstance(value, list):
return [item for item in value if _is_publication_item(item)]
if not isinstance(value, dict):
return []
nested_items = value.get("items")
if isinstance(nested_items, list):
return [item for item in nested_items if _is_publication_item(item)]
if isinstance(nested_items, dict):
return _flatten_publication_items(nested_items)
publications = []
for child in value.values():
publications.extend(_flatten_publication_items(child))
return publications
def _is_publication_item(value: object) -> bool:
return isinstance(value, dict) and ("id" in value or "title" in value)
def _load_widget_graduation_theses(
    session: Session,
    soup: BeautifulSoup,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
) -> list[dict]:
    """Load supervised graduation theses through the VKR widget API.

    The supervisor id and the API path are read from the VKR ``<script>``
    tag in ``soup``; the path falls back to ``/n/vkr/api/``. Returns an
    empty list when the widget or id is missing, when the request fails,
    or when the response has no ``data`` list.
    """
    widget = soup.select_one('script[src*="/n/stat/vkr/app.js"][data-person-id]')
    if not widget:
        return []
    person_id = normalize_ws(widget.get("data-person-id"))
    api_url = normalize_ws(widget.get("data-api-url")) or "/n/vkr/api/"
    if not person_id:
        return []

    request_headers = {**headers, "x-portal-language": "ru"}
    try:
        response = session.get(
            urljoin(source_url, api_url),
            params={"supervisorId": person_id},
            headers=request_headers,
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
    except Exception:
        # Best effort: any failure simply means "no theses".
        return []

    if not isinstance(data, dict):
        return []
    raw_items = data.get("data")
    if not isinstance(raw_items, list):
        return []
    theses = []
    for raw_item in raw_items:
        if isinstance(raw_item, dict):
            theses.append(_normalize_vkr_item(raw_item, source_url))
    return theses
def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]:
nodes = []
for sibling in start_h2.next_siblings:
@@ -353,6 +484,122 @@ def _parse_vkr_items(nodes: list) -> list[str]:
return [item for item in dict.fromkeys(items) if item]
def _normalize_publication_item(item: dict) -> dict:
    """Map one raw publication API item onto the parser's publication dict.

    The result has the keys ``id``, ``title``, ``year``, ``type``, ``url``
    and ``text``. ``text`` joins title, year and the short description;
    ``title`` and ``text`` fall back to the publication id.
    """
    pub_id = str(item.get("id") or "").strip()
    title = _html_to_text(item.get("title"))
    year = item.get("year")
    pub_type = str(item.get("type") or "").strip() or None

    raw_description = item.get("description")
    description = raw_description if isinstance(raw_description, dict) else {}
    summary = _localized_value(description.get("short")) or _localized_value(description.get("shortLeft"))

    text_parts = [title, str(year or ""), summary]
    text = normalize_ws(" ".join(filter(None, text_parts)))

    url = None
    if pub_id:
        url = f"https://publications.hse.ru/view/{pub_id}"
    return {
        "id": pub_id or None,
        "title": title or pub_id,
        "year": year,
        "type": pub_type,
        "url": url,
        "text": text or title or pub_id,
    }
def _normalize_vkr_item(item: dict, source_url: str) -> dict:
    """Convert one raw VKR API record into the flat thesis structure.

    ``learnProgram`` and ``orgUnit`` are used only when they are dicts.
    Relative links are resolved against ``source_url``; missing links
    become ``None``.
    """

    def nested(key: str) -> dict:
        candidate = item.get(key)
        return candidate if isinstance(candidate, dict) else {}

    thesis_id = item.get("id")
    program = nested("learnProgram")
    org_unit = nested("orgUnit")

    supervisors = []
    for entry in item.get("supervisors") or []:
        if isinstance(entry, dict):
            name = normalize_ws(entry.get("name"))
            url = normalize_ws(entry.get("url"))
            if name or url:
                supervisors.append({"name": name or url, "url": url or None})

    project_url = urljoin(source_url, f"/edu/vkr/{thesis_id}") if thesis_id else None
    program_url = urljoin(source_url, program.get("url")) if program.get("url") else None
    org_unit_url = urljoin(source_url, org_unit.get("url")) if org_unit.get("url") else None

    text_parts = [item.get("student"), item.get("title"), item.get("year")]
    text = normalize_ws(" ".join(str(part) for part in text_parts if part))

    return {
        "id": thesis_id,
        "student": normalize_ws(item.get("student")),
        "title": normalize_ws(item.get("title")),
        "defense_year": item.get("year"),
        "level": normalize_ws(item.get("level")),
        "rating": item.get("rating"),
        "project_url": project_url,
        "program": normalize_ws(program.get("title")),
        "program_url": program_url,
        "org_unit": normalize_ws(org_unit.get("title")),
        "org_unit_url": org_unit_url,
        "supervisors": supervisors,
        "text": text,
    }
def _upsert_publications_section(sections: list[dict], publications: list[dict]) -> list[dict]:
merged = []
inserted = False
for section in sections:
if section.get("type") != "publications":
merged.append(section)
continue
existing = section.get("publications") or []
section = {
**section,
"publications_count": max(section.get("publications_count") or 0, len(publications)),
"publications": _dedupe_publications([*existing, *publications]),
}
section["items"] = [item["text"] for item in section["publications"] if item.get("text")]
merged.append(section)
inserted = True
if not inserted:
merged.append(
{
"title": "Публикации и исследования",
"slug": "publikacii_i_issledovaniya",
"type": "publications",
"raw_text": "",
"paragraphs": [],
"items": [item["text"] for item in publications if item.get("text")],
"links": [],
"publications_count": len(publications),
"publications": publications,
}
)
return merged
def _upsert_graduation_theses_section(sections: list[dict], theses: list[dict]) -> list[dict]:
section = {
"title": "Выпускные квалификационные работы студентов НИУ ВШЭ",
"slug": "vypusknye_kvalifikacionnye_raboty_studentov_niu_vshe",
"type": "graduation_theses",
"raw_text": "",
"paragraphs": [],
"items": [item["text"] for item in theses if item.get("text")],
"links": [{"text": item["title"], "url": item["project_url"]} for item in theses if item.get("title") and item.get("project_url")],
"theses_count": len(theses),
"theses": theses,
}
return [item for item in sections if item.get("type") != "graduation_theses"] + [section]
def _dedupe_publications(items: list[dict]) -> list[dict]:
seen = set()
unique = []
for item in items:
key = item.get("id") or item.get("url") or item.get("title")
if key and key not in seen:
seen.add(key)
unique.append(item)
return unique
def _html_to_text(value: object) -> str:
    """Strip HTML markup from *value* and return whitespace-normalised text.

    Falsy values are treated as an empty string.
    """
    markup = str(value or "")
    plain = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
    return normalize_ws(plain)
def _localized_value(value: object) -> str:
    """Pick a display string from a possibly localised value.

    For dicts the ``ru`` entry is preferred, then ``publ``, then ``en``;
    any other value is stringified (falsy values become an empty string).
    """
    if not isinstance(value, dict):
        return normalize_ws(str(value or ""))
    return normalize_ws(value.get("ru") or value.get("publ") or value.get("en"))
def _slugify(value: str) -> str:
cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"

View File

@@ -44,9 +44,3 @@ def require_admin(request: Request, settings: Settings) -> str:
if not username:
raise HTTPException(status_code=status.HTTP_303_SEE_OTHER, headers={"Location": "/admin/login"})
return username
def require_mcp_token(request: Request, settings: Settings) -> None:
    """Reject the request unless it carries the configured MCP bearer token.

    The ``Authorization`` header must start with ``"Bearer "`` and the rest
    (stripped) must equal ``settings.mcp_token``.

    Raises:
        HTTPException: 401 when the header is missing, malformed, does not
            match, or when no token is configured at all.
    """
    auth = request.headers.get("authorization", "")
    presented = auth.removeprefix("Bearer ").strip() if auth.startswith("Bearer ") else ""
    expected = settings.mcp_token or ""

    # An unset/empty configured token must never authorise anyone: without this
    # guard "Bearer " followed by nothing would compare equal to "".
    # Comparing bytes avoids the TypeError compare_digest raises for non-ASCII str.
    authorised = (
        bool(expected)
        and bool(presented)
        and hmac.compare_digest(presented.encode("utf-8"), expected.encode("utf-8"))
    )
    if not authorised:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid MCP token")

462
app/services/admin_data.py Normal file
View File

@@ -0,0 +1,462 @@
from __future__ import annotations
from datetime import date, datetime, time
from math import ceil
from typing import Any
from zoneinfo import ZoneInfo
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
# Sort keys accepted by list_employees_page, mapped to the SQLAlchemy
# expression used for ordering; unknown keys fall back to full_name there.
EMPLOYEE_SORTS = {
    "full_name": Employee.full_name,
    "status": Employee.status,
    "first_seen_at": Employee.first_seen_at,
    "last_seen_at": Employee.last_seen_at,
    "dismissed_at": Employee.dismissed_at,
    # Orders by the "hse_start_year" entry of the current_data JSON, read as an integer.
    "hse_start_year": Employee.current_data["hse_start_year"].as_integer(),
}
def employee_display_payload(employee: Employee) -> dict[str, Any]:
    """Build the list-view representation of an employee.

    Combines the employee's own columns with values read from its
    ``current_data`` (positions, contacts, section counters) and adds both
    ISO and human-readable variants of the timestamps.
    """

    def iso(moment: Any) -> str | None:
        return moment.isoformat() if moment else None

    profile = _as_dict(employee.current_data)
    contact_info = _as_dict(profile.get("contacts"))
    section_list = _as_list(profile.get("sections"))

    positions = _clean_list(profile.get("positions"))
    emails = _clean_list(contact_info.get("emails"))
    phones = _clean_list(contact_info.get("phones"))

    return {
        "id": employee.id,
        "full_name": employee.full_name,
        "status": employee.status,
        "status_display": _employee_status_display(employee.status),
        "canonical_url": employee.canonical_url,
        "positions": positions,
        "positions_text": "; ".join(positions),
        "hse_start_year": profile.get("hse_start_year"),
        "emails": emails,
        "email_text": ", ".join(emails),
        "phones": phones,
        "phone_text": ", ".join(phones),
        "address": contact_info.get("address"),
        "publications_count": _count_section_items(section_list, "publications"),
        "courses_count": _count_section_items(section_list, "courses_by_year"),
        "first_seen_at": iso(employee.first_seen_at),
        "last_seen_at": iso(employee.last_seen_at),
        "dismissed_at": iso(employee.dismissed_at),
        "first_seen_display": format_admin_datetime(employee.first_seen_at),
        "last_seen_display": format_admin_datetime(employee.last_seen_at),
        "dismissed_display": format_admin_datetime(employee.dismissed_at),
    }
def employee_detail_payload(employee: Employee) -> dict[str, Any]:
    """Build the detail-view representation of an employee.

    Starts from :func:`employee_display_payload` and adds profile metadata
    (column value first, ``current_data`` as fallback), normalised
    contacts, external ids and sections.
    """
    profile = _as_dict(employee.current_data)
    contact_info = _as_dict(profile.get("contacts"))

    payload = dict(employee_display_payload(employee))
    payload["profile_type"] = employee.profile_type or profile.get("profile_type")
    payload["profile_id"] = employee.profile_id or profile.get("profile_id")
    payload["parser_version"] = employee.parser_version or profile.get("parser_version")
    payload["contacts"] = {
        "emails": _clean_list(contact_info.get("emails")),
        "phones": _clean_list(contact_info.get("phones")),
        "address": contact_info.get("address"),
        "contact_items": _normalize_contact_items(contact_info.get("items")),
    }
    payload["external_ids"] = _normalize_external_ids(profile.get("external_ids"))
    payload["sections"] = [_normalize_section(entry) for entry in _as_list(profile.get("sections"))]
    return payload
def build_employee_query(
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
) -> Select[tuple[Employee]]:
    """Build the filtered ``SELECT`` over employees used by the directory.

    Args:
        status: Exact employee status to match; ignored when empty.
        q: Free-text search, matched case-insensitively as a substring of
            the full name or the canonical URL.
        started_from: Lower bound (inclusive, start of day) on ``first_seen_at``.
        started_to: Upper bound (inclusive, end of day) on ``first_seen_at``.
        has_email: ``True`` keeps rows whose ``current_data`` text contains
            ``@``; ``False`` keeps rows without it (or without data);
            ``None`` applies no e-mail filter.

    Returns:
        The statement with all requested filters AND-ed together.
    """
    stmt = select(Employee)
    filters = []

    if status:
        filters.append(Employee.status == status)

    if q:
        # The search text is user input: escape LIKE metacharacters so that
        # "%" and "_" are matched literally instead of acting as wildcards.
        escaped = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        pattern = f"%{escaped}%"
        filters.append(
            or_(
                Employee.full_name.ilike(pattern, escape="\\"),
                Employee.canonical_url.ilike(pattern, escape="\\"),
            )
        )

    # NOTE(review): the bounds are naive datetimes; confirm this matches how
    # first_seen_at is stored (the crawler writes timezone-aware UTC values).
    if started_from:
        filters.append(Employee.first_seen_at >= datetime.combine(started_from, time.min))
    if started_to:
        filters.append(Employee.first_seen_at <= datetime.combine(started_to, time.max))

    # The e-mail check is a textual search for "@" anywhere in the JSON payload.
    contains_at = Employee.current_data.cast(Text).ilike("%@%")
    if has_email is True:
        filters.append(contains_at)
    elif has_email is False:
        filters.append(or_(Employee.current_data.is_(None), ~contains_at))

    if filters:
        stmt = stmt.where(and_(*filters))
    return stmt
def list_employees_page(
    db: Session,
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
) -> dict[str, Any]:
    """Return one page of the employee directory together with paging info.

    Args:
        db: Database session used for the count and the page query.
        status, q, started_from, started_to, has_email: Filters forwarded
            to :func:`build_employee_query`.
        sort: Key of ``EMPLOYEE_SORTS``; unknown keys sort by full name.
        direction: ``"desc"`` for descending order, anything else ascending.
        limit: Page size; values other than 25, 50 or 100 become 50.
        offset: Number of rows to skip; negative values become 0.

    Returns:
        A dict with ``employees`` (display payloads), ``total``, ``limit``,
        ``offset``, ``pages`` and the 1-based ``page``.
    """
    limit = limit if limit in {25, 50, 100} else 50
    offset = max(0, offset)

    base_stmt = build_employee_query(
        status=status,
        q=q,
        started_from=started_from,
        started_to=started_to,
        has_email=has_email,
    )
    total = db.scalar(select(func.count()).select_from(base_stmt.subquery())) or 0

    sort_column = EMPLOYEE_SORTS.get(sort, Employee.full_name)
    order = desc(sort_column) if direction == "desc" else sort_column
    # Most sort keys are not unique (status, dates, start year). Without a
    # unique tie-breaker LIMIT/OFFSET may repeat or skip rows between pages.
    page_stmt = base_stmt.order_by(order, Employee.id).limit(limit).offset(offset)
    employees = db.scalars(page_stmt).all()

    return {
        "employees": [employee_display_payload(employee) for employee in employees],
        "total": total,
        "limit": limit,
        "offset": offset,
        "pages": ceil(total / limit) if total else 0,
        "page": (offset // limit) + 1,
    }
def stats_payload(db: Session) -> dict[str, Any]:
    """Collect the dashboard counters and the latest/running crawl runs.

    Returns employee totals (all, active, dismissed), the ``new_count`` of
    the most recently started run, the most recently first-seen employee,
    and payloads for the latest run and the currently running run.
    """
    newest_run_first = desc(CrawlRun.started_at)
    latest_run = db.scalar(select(CrawlRun).order_by(newest_run_first).limit(1))
    running_run = db.scalar(
        select(CrawlRun).where(CrawlRun.status == "running").order_by(newest_run_first).limit(1)
    )
    latest_added = db.scalar(select(Employee).order_by(desc(Employee.first_seen_at)).limit(1))

    employee_count = select(func.count()).select_from(Employee)
    total = db.scalar(employee_count) or 0
    active = db.scalar(employee_count.where(Employee.status == "active")) or 0
    dismissed = db.scalar(employee_count.where(Employee.status == "dismissed")) or 0

    return {
        "total": total,
        "active": active,
        "dismissed": dismissed,
        "new_in_last_run": latest_run.new_count if latest_run else 0,
        "latest_added": employee_display_payload(latest_added) if latest_added else None,
        "latest_run": run_payload(latest_run) if latest_run else None,
        "current_running_run": run_payload(running_run) if running_run else None,
    }
def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
    """Serialise a crawl run for the admin UI, or return ``None`` for no run.

    ``processed_count`` is parsed plus failed profiles; ``progress_percent``
    is its share of ``found_count`` rounded to one decimal (0 when nothing
    was found).
    """
    if not run:
        return None

    processed = run.parsed_count + run.error_count
    if run.found_count:
        percent = round((processed / run.found_count) * 100, 1)
    else:
        percent = 0

    started_iso = run.started_at.isoformat() if run.started_at else None
    finished_iso = run.finished_at.isoformat() if run.finished_at else None
    return {
        "id": run.id,
        "source_url": run.source_url,
        "status": run.status,
        "status_display": _run_status_display(run.status),
        "started_at": started_iso,
        "finished_at": finished_iso,
        "started_display": format_admin_datetime(run.started_at),
        "finished_display": format_admin_datetime(run.finished_at),
        "found_count": run.found_count,
        "parsed_count": run.parsed_count,
        "new_count": run.new_count,
        "error_count": run.error_count,
        "dismissed_count": run.dismissed_count,
        "processed_count": processed,
        "progress_percent": percent,
        "message": run.message,
    }
def run_detail_payload(db: Session, run: CrawlRun | None) -> dict[str, Any] | None:
    """Serialise a crawl run together with its employee changes and errors.

    Changes are grouped by ``change_type``; the three known types are always
    present as keys, other types are added as they occur. Returns ``None``
    when no run is given.
    """
    if not run:
        return None

    change_rows = db.scalars(
        select(CrawlRunEmployeeChange)
        .where(CrawlRunEmployeeChange.crawl_run_id == run.id)
        .order_by(CrawlRunEmployeeChange.created_at, CrawlRunEmployeeChange.id)
    ).all()
    error_rows = db.scalars(
        select(CrawlError).where(CrawlError.crawl_run_id == run.id).order_by(CrawlError.created_at)
    ).all()

    by_type = {"new": [], "missing_from_source": [], "dismissed": []}
    for row in change_rows:
        if row.change_type not in by_type:
            by_type[row.change_type] = []
        by_type[row.change_type].append(_change_payload(row))

    detail = dict(run_payload(run) or {})
    detail["changes_detail_available"] = bool(change_rows)
    detail["changes"] = by_type
    detail["errors"] = [_crawl_error_payload(row) for row in error_rows]
    return detail
def format_admin_datetime(value: Any) -> str:
    """Render a timestamp as ``DD.MM.YYYY HH:MM`` for the admin UI.

    Falsy values give the "not specified" label. ISO strings are parsed
    (a trailing ``Z`` is read as UTC); strings that fail to parse are
    returned unchanged. Non-datetime values are stringified. Timezone-aware
    datetimes are converted to Europe/Moscow; naive ones are formatted as-is.
    """
    if not value:
        return "Не указано"

    moment = value
    if isinstance(moment, str):
        try:
            moment = datetime.fromisoformat(moment.replace("Z", "+00:00"))
        except ValueError:
            return value

    if not isinstance(moment, datetime):
        return str(moment)

    if moment.tzinfo:
        moment = moment.astimezone(ZoneInfo("Europe/Moscow"))
    return moment.strftime("%d.%m.%Y %H:%M")
def _employee_status_display(status: str | None) -> str:
labels = {"active": "Работает", "dismissed": "Уволен"}
return labels.get(status or "", status or "Не указано")
def _run_status_display(status: str | None) -> str:
labels = {"running": "Выполняется", "completed": "Завершен", "failed": "Ошибка"}
return labels.get(status or "", status or "Не указано")
def _change_payload(change: CrawlRunEmployeeChange) -> dict[str, Any]:
    """Serialise one recorded employee change of a crawl run.

    Copies the stored columns and adds display labels for the change type,
    the profile availability flag and the creation time.
    """
    created_at = change.created_at
    payload = {
        "id": change.id,
        "employee_id": change.employee_id,
        "profile_key": change.profile_key,
        "profile_url": change.profile_url,
        "full_name": change.full_name,
        "change_type": change.change_type,
        "change_type_display": _change_type_display(change.change_type),
        "profile_available": change.profile_available,
        "profile_available_display": _profile_available_display(change.profile_available),
        "message": change.message,
        "created_at": created_at.isoformat() if created_at else None,
        "created_display": format_admin_datetime(created_at),
    }
    return payload
def _crawl_error_payload(error: CrawlError) -> dict[str, Any]:
    """Serialise one crawl error, adding a display form of its timestamp."""
    created_at = error.created_at
    payload = {
        "id": error.id,
        "crawl_run_id": error.crawl_run_id,
        "profile_url": error.profile_url,
        "error_type": error.error_type,
        "message": error.message,
        "created_at": created_at.isoformat() if created_at else None,
        "created_display": format_admin_datetime(created_at),
    }
    return payload
def _change_type_display(change_type: str | None) -> str:
labels = {
"new": "Новый",
"missing_from_source": "Потеряшка",
"dismissed": "Уволен",
}
return labels.get(change_type or "", change_type or "Не указано")
def _profile_available_display(value: bool | None) -> str:
if value is True:
return "Профиль доступен"
if value is False:
return "Профиль недоступен"
return "Не проверялось"
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:
if section.get("type") != section_type:
continue
if section_type == "publications":
total += len(section.get("publications") or section.get("items") or [])
elif section_type == "courses_by_year":
total += len(section.get("courses") or [])
return total
def _clean_list(values: Any) -> list[str]:
if values is None:
return []
if not isinstance(values, list):
values = [values]
return [str(value).strip() for value in values if str(value or "").strip()]
def _as_dict(value: Any) -> dict[str, Any]:
return value if isinstance(value, dict) else {}
def _as_list(value: Any) -> list[Any]:
if value is None:
return []
return value if isinstance(value, list) else [value]
def _normalize_contact_items(items: Any) -> list[str]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if isinstance(item, dict):
value = item.get("raw") or item.get("value") or item.get("text")
else:
value = item
value = str(value or "").strip()
if value:
normalized.append(value)
return normalized
def _normalize_external_ids(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
system = str(item.get("system") or "").strip()
value = str(item.get("value") or "").strip()
url = str(item.get("url") or "").strip()
if system or value or url:
normalized.append({"system": system or "ID", "value": value or url, "url": url or None})
return normalized
def _normalize_section(section: Any) -> dict[str, Any]:
if not isinstance(section, dict):
return {"title": "Раздел", "type": "generic", "paragraphs": [str(section)], "items": [], "links": []}
section_type = section.get("type") or "generic"
paragraphs = _clean_list(section.get("paragraphs"))
items = _clean_list(section.get("items"))
raw_text = str(section.get("raw_text") or "").strip()
if not paragraphs and not items and raw_text:
paragraphs = [raw_text]
return {
"title": section.get("title") or "Раздел",
"type": section_type,
"raw_text": raw_text,
"paragraphs": paragraphs,
"list_items": items,
"links": _normalize_links(section.get("links")),
"year_entries": _normalize_year_entries(section.get("year_entries")),
"publications": _normalize_publications(section.get("publications")),
"publications_count": section.get("publications_count"),
"theses": _normalize_theses(section.get("theses")),
"theses_count": section.get("theses_count"),
"academic_year": section.get("academic_year"),
"courses": _normalize_courses(section.get("courses")),
"table": _normalize_table(section.get("table")),
}
def _normalize_links(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
text = str(item.get("text") or item.get("url") or "").strip()
url = str(item.get("url") or "").strip()
if text and url:
normalized.append({"text": text, "url": url})
return normalized
def _normalize_year_entries(items: Any) -> list[dict[str, Any]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
text = str(item.get("text") or "").strip()
if text:
normalized.append({"year": item.get("year"), "text": text, "links": _normalize_links(item.get("links"))})
return normalized
def _normalize_publications(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
text = str(item or "").strip()
if text:
normalized.append({"title": text, "text": text, "url": None})
continue
title = str(item.get("title") or "").strip()
text = str(item.get("text") or title).strip()
url = str(item.get("url") or "").strip()
if title or text:
normalized.append({"title": title or text, "text": text or title, "url": url or None})
return normalized
def _normalize_courses(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
title = str(item or "").strip()
if title:
normalized.append({"title": title, "url": None})
continue
title = str(item.get("title") or "").strip()
url = str(item.get("url") or "").strip()
if title or url:
normalized.append({"title": title or url, "url": url or None})
return normalized
def _normalize_theses(items: Any) -> list[dict[str, Any]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
title = str(item.get("title") or "").strip()
student = str(item.get("student") or "").strip()
if not title and not student:
continue
normalized.append(
{
"id": item.get("id"),
"student": student,
"title": title,
"defense_year": item.get("defense_year") or item.get("year"),
"level": str(item.get("level") or "").strip(),
"rating": item.get("rating"),
"project_url": str(item.get("project_url") or "").strip() or None,
"program": str(item.get("program") or "").strip(),
"program_url": str(item.get("program_url") or "").strip() or None,
"org_unit": str(item.get("org_unit") or "").strip(),
"org_unit_url": str(item.get("org_unit_url") or "").strip() or None,
}
)
return normalized
def _normalize_table(table: Any) -> dict[str, Any] | None:
if not isinstance(table, dict):
return None
headers = _clean_list(table.get("headers"))
rows = []
for row in table.get("rows") or []:
if not isinstance(row, dict):
continue
cells = _clean_list(row.get("cells"))
if cells:
rows.append({"cells": cells, "link_url": row.get("link_url")})
if not headers and not rows:
return None
return {"headers": headers, "rows": rows}

View File

@@ -0,0 +1,17 @@
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlRun
from app.services.crawler import run_crawl
def get_running_run(db: Session) -> CrawlRun | None:
    """Return the most recently started crawl run with status "running", if any."""
    newest_running = (
        select(CrawlRun)
        .where(CrawlRun.status == "running")
        .order_by(desc(CrawlRun.started_at))
        .limit(1)
    )
    return db.scalar(newest_running)
def run_crawl_if_idle(db: Session, settings: Settings) -> tuple[CrawlRun, bool]:
    """Start a crawl unless one is already marked as running.

    Returns:
        ``(run, started)``: the already running run with ``False``, or the
        run produced by a new crawl with ``True``.
    """
    active_run = get_running_run(db)
    if not active_run:
        return run_crawl(db, settings), True
    return active_run, False

View File

@@ -9,7 +9,7 @@ from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
@@ -68,7 +68,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
finally:
time.sleep(settings.request_delay_seconds)
run.dismissed_count = _mark_dismissed(db, found_keys)
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed"
except Exception as exc:
run.status = "failed"
@@ -106,6 +106,10 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
first_seen_at=now,
)
db.add(employee)
run.new_count += 1
is_new = True
else:
is_new = False
employee.full_name = parsed.get("full_name")
employee.status = "active"
@@ -116,6 +120,16 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
employee.current_checksum = checksum
db.flush()
if is_new:
_record_employee_change(
db,
run,
employee,
"new",
profile_available=True,
message="Сотрудник впервые найден в источнике.",
)
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
db.add(
@@ -140,20 +154,70 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
return employee
def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
    """Handle active employees that were not found in the current crawl.

    Each such employee's profile URL is probed. If the profile still
    responds, only a ``missing_from_source`` change is recorded; otherwise
    the employee is marked dismissed and a ``dismissed`` change is recorded.
    The session is committed once at the end.

    Returns:
        The number of employees marked as dismissed.
    """
    active_employees = db.scalars(select(Employee).where(Employee.status == "active")).all()
    now = datetime.now(timezone.utc)
    absent = [employee for employee in active_employees if employee.profile_key not in found_keys]

    dismissed_total = 0
    for employee in absent:
        if _profile_is_available(session, employee.canonical_url, timeout):
            # Still reachable: keep the employee active and only log the discrepancy.
            _record_employee_change(
                db,
                run,
                employee,
                "missing_from_source",
                profile_available=True,
                message="Профиль доступен, но ссылка отсутствует в исходном списке.",
            )
        else:
            employee.status = "dismissed"
            employee.dismissed_at = now
            _record_employee_change(
                db,
                run,
                employee,
                "dismissed",
                profile_available=False,
                message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.",
            )
            dismissed_total += 1

    db.commit()
    return dismissed_total
def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool:
    """Probe a profile URL and report whether it answered with a status below 400.

    Redirects are followed. Any ``requests.RequestException`` counts as
    "not available".
    """
    try:
        status_code = session.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True).status_code
    except requests.RequestException:
        return False
    return status_code < 400
def _record_employee_change(
    db: Session,
    run: CrawlRun,
    employee: Employee,
    change_type: str,
    *,
    profile_available: bool | None,
    message: str,
) -> None:
    """Queue a change record for *employee* within *run*.

    The record stores the employee's id, profile key, canonical URL and
    full name as they are at call time. The row is only added to the
    session here; nothing is flushed or committed.
    """
    change = CrawlRunEmployeeChange(
        crawl_run_id=run.id,
        employee_id=employee.id,
        profile_key=employee.profile_key,
        profile_url=employee.canonical_url,
        full_name=employee.full_name,
        change_type=change_type,
        profile_available=profile_available,
        message=message,
    )
    db.add(change)
def _checksum(data: dict) -> str:
payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()

View File

@@ -1,6 +1,8 @@
.admin {
margin: 0;
min-height: 100vh;
display: flex;
flex-direction: column;
color: #1f2937;
background: #f6f7f9;
font-family: Arial, sans-serif;
@@ -21,6 +23,11 @@
font-size: 20px;
}
/* Brand link: takes the surrounding text colour and drops the underline. */
.admin__brand-link {
color: inherit;
text-decoration: none;
}
.admin__nav {
display: flex;
align-items: center;
@@ -34,6 +41,7 @@
}
/* Main content column: centred, at most 1180px wide, with a 16px gutter on
   each side on narrow screens. flex: 1 lets it grow inside a flex parent. */
.admin__main {
flex: 1;
width: min(1180px, calc(100% - 32px));
margin: 28px auto;
}
@@ -52,18 +60,30 @@
}
/* Metric tile: bordered white card. The --link modifier is for tiles that
   are links; it removes link styling and highlights the border on hover. */
.metric {
display: block;
padding: 18px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.metric--link {
color: inherit;
text-decoration: none;
}
.metric--link:hover {
border-color: #0f766e;
}
/* Small muted caption of a metric tile. */
.metric__label {
display: block;
color: #6b7280;
font-size: 13px;
}
.metric__value {
display: block;
margin-top: 8px;
font-size: 28px;
font-weight: 700;
@@ -87,6 +107,14 @@
border-collapse: collapse;
}
/* Table rows show a pointer cursor and a tinted background on hover. */
.table__row {
cursor: pointer;
}
.table__row:hover {
background: #f0fdfa;
}
.table__cell,
.table__head {
padding: 10px 8px;
@@ -151,3 +179,444 @@
border-radius: 8px;
white-space: pre-wrap;
}
/* Employee card: vertical grid of bordered white panels. */
.employee-card {
display: grid;
gap: 18px;
}
/* Header panel: two groups pushed to opposite ends of a flex row. */
.employee-card__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 18px;
padding: 22px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.employee-card__identity {
display: grid;
gap: 10px;
}
.employee-card__title {
margin: 0;
font-size: 24px;
}
.employee-card__section {
padding: 20px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* Meta grid: as many columns of at least 240px as fit the available width. */
.employee-card__meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 14px;
margin: 0;
}
/* min-width: 0 lets grid items shrink below their content width. */
.employee-card__meta-item {
min-width: 0;
}
/* Wide item spans every column of the meta grid. */
.employee-card__meta-item--wide {
grid-column: 1 / -1;
}
.employee-card__meta-label {
margin-bottom: 5px;
color: #6b7280;
font-size: 12px;
font-weight: 700;
text-transform: uppercase;
}
.employee-card__meta-value {
margin: 0;
color: #1f2937;
line-height: 1.45;
}
.employee-card__list {
display: grid;
gap: 8px;
margin: 0;
padding-left: 18px;
}
.employee-card__list-item {
line-height: 1.45;
}
.employee-card__sections {
display: grid;
gap: 14px;
}
/* Employee section: light grey bordered box. */
.employee-section {
padding: 16px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 8px;
}
.employee-section__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 12px;
margin-bottom: 12px;
}
.employee-section__title {
margin: 0;
font-size: 17px;
}
/* Pill badge; flex: 0 0 auto keeps it from growing or shrinking in the header row. */
.employee-section__type {
flex: 0 0 auto;
padding: 3px 8px;
color: #475569;
background: #e2e8f0;
border-radius: 999px;
font-size: 12px;
}
.employee-section__note {
margin: 0 0 10px;
color: #4b5563;
font-weight: 700;
}
.employee-section__text {
margin: 0 0 10px;
line-height: 1.55;
}
.employee-section__meta {
display: flex;
flex-wrap: wrap;
gap: 8px 12px;
color: #4b5563;
font-size: 13px;
}
.employee-section__meta-item {
line-height: 1.4;
}
/* Wrapper scrolls horizontally when the table is wider than the section. */
.employee-section__table-wrap {
overflow-x: auto;
}
.employee-section__table {
width: 100%;
border-collapse: collapse;
background: #ffffff;
}
.employee-section__head,
.employee-section__cell {
padding: 10px;
border-bottom: 1px solid #e5e7eb;
text-align: left;
vertical-align: top;
}
.employee-section__head {
color: #374151;
background: #f3f4f6;
font-size: 13px;
}
.employee-section__links {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 12px;
}
/* Links rendered as teal pills without underline. */
.employee-section__link {
padding: 5px 9px;
color: #0f766e;
background: #ccfbf1;
border-radius: 999px;
font-size: 12px;
font-weight: 700;
text-decoration: none;
}
/* Stats strip: responsive grid of bordered cards, columns at least 220px wide. */
.stats-strip {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
gap: 14px;
margin-top: 16px;
}
.stats-strip__item {
display: block;
padding: 14px 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* Link variant: no link styling, border highlight on hover. */
.stats-strip__item--link {
color: inherit;
text-decoration: none;
}
.stats-strip__item--link:hover {
border-color: #0f766e;
}
.stats-strip__label {
display: block;
color: #6b7280;
font-size: 12px;
text-transform: uppercase;
}
.stats-strip__value {
display: block;
margin-top: 6px;
color: #1f2937;
font-weight: 700;
}
/* Progress panel: header row, body, and a meta line with wrapped items. */
.progress-panel {
display: grid;
gap: 12px;
}
.progress-panel__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
}
.progress-panel__body {
display: grid;
gap: 10px;
}
.progress-panel__meta {
display: flex;
flex-wrap: wrap;
gap: 12px;
color: #4b5563;
font-size: 14px;
}
.progress-panel__percent {
color: #0f766e;
font-weight: 700;
}
.progress-panel__empty {
margin: 0;
color: #6b7280;
}
/* Progress bar track; overflow: hidden clips the fill to the rounded corners. */
.progress-bar {
height: 12px;
overflow: hidden;
background: #e5e7eb;
border-radius: 999px;
}
/* Fill starts at width 0; width changes are animated over 0.25s. */
.progress-bar__fill {
height: 100%;
width: 0;
background: #0f766e;
transition: width 0.25s ease;
}
/* Directory page: header, filter grid, scrollable table wrapper, pagination. */
.directory {
display: grid;
gap: 18px;
}
.directory__header {
display: flex;
align-items: end;
justify-content: space-between;
gap: 16px;
}
.directory__title {
margin: 0;
font-size: 24px;
}
.directory__summary {
margin: 6px 0 0;
color: #6b7280;
}
/* Filter grid: one wider first column followed by six equal columns. */
.directory__filters {
display: grid;
grid-template-columns: minmax(220px, 1.7fr) repeat(6, minmax(120px, 1fr));
gap: 10px;
padding: 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* min-width: 0 lets inputs shrink inside their grid column. */
.directory__input {
min-width: 0;
padding: 10px 12px;
border: 1px solid #cbd5e1;
border-radius: 6px;
}
/* Horizontal scroll container for the table, which has a 1120px minimum width. */
.directory__table-wrap {
overflow-x: auto;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.directory__pagination {
display: flex;
align-items: center;
justify-content: center;
gap: 16px;
}
.directory__page {
color: #4b5563;
font-weight: 700;
}
.directory-table {
width: 100%;
min-width: 1120px;
border-collapse: collapse;
}
.directory-table__head {
padding: 12px 10px;
color: #374151;
background: #f9fafb;
border-bottom: 1px solid #e5e7eb;
font-size: 13px;
text-align: left;
white-space: nowrap;
}
.directory-table__cell {
max-width: 280px;
padding: 12px 10px;
border-bottom: 1px solid #e5e7eb;
vertical-align: top;
}
.directory-table__row {
cursor: pointer;
}
.directory-table__row:hover {
background: #f0fdfa;
}
.directory-table__empty {
padding: 28px;
color: #6b7280;
text-align: center;
}
/* Hidden-column modifiers; admin.js toggles these classes per column. */
.directory-table__cell--hidden,
.directory-table__head--hidden {
display: none;
}
/* Columns modal: fixed full-viewport layer that centres its panel. */
.columns-modal {
position: fixed;
inset: 0;
z-index: 50;
display: grid;
place-items: center;
padding: 20px;
}
/* display: grid above would override the default hiding of [hidden],
   so the hidden state is restored explicitly. */
.columns-modal[hidden] {
display: none;
}
/* Semi-transparent layer covering the whole modal area behind the panel. */
.columns-modal__backdrop {
position: absolute;
inset: 0;
background: rgba(17, 24, 39, 0.54);
}
/* Panel: capped in width and height, scrolls internally when content is taller. */
.columns-modal__panel {
position: relative;
width: min(620px, 100%);
max-height: min(720px, calc(100vh - 40px));
overflow: auto;
padding: 20px;
background: #ffffff;
border-radius: 8px;
box-shadow: 0 24px 80px rgba(15, 23, 42, 0.22);
}
.columns-modal__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}
.columns-modal__title {
margin: 0;
font-size: 18px;
}
.columns-modal__grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 10px;
margin-top: 18px;
}
.columns-modal__option {
display: flex;
align-items: center;
gap: 8px;
padding: 10px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 6px;
}
.columns-modal__checkbox {
width: 16px;
height: 16px;
}
/* Up to 920px: filters in two columns; header rows stack vertically. */
@media (max-width: 920px) {
.directory__filters {
grid-template-columns: 1fr 1fr;
}
.progress-panel__header,
.directory__header,
.employee-card__header {
align-items: stretch;
flex-direction: column;
}
}
/* Up to 620px: filters in a single column. */
@media (max-width: 620px) {
.directory__filters {
grid-template-columns: 1fr;
}
}

125
app/static/admin.js Normal file
View File

@@ -0,0 +1,125 @@
(function () {
// Columns shown when nothing usable is stored; values are compared with the
// data-column attributes of table cells/heads and with the toggle checkboxes' values.
const columnDefaults = [
"full_name",
"status",
"positions",
"hse_start_year",
"email",
"last_seen_at",
"dismissed_at",
"profile",
];
// localStorage key under which the selected column list is saved as JSON.
const storageKey = "miem.directory.columns";
// Returns the stored column list, or the defaults when nothing usable is
// stored (missing, empty, not an array, unparsable, or storage unavailable).
function readColumns() {
  let stored;
  try {
    stored = JSON.parse(localStorage.getItem(storageKey) || "[]");
  } catch (_error) {
    return columnDefaults;
  }
  if (Array.isArray(stored) && stored.length) {
    return stored;
  }
  return columnDefaults;
}
// Saves the selected column list to localStorage.
// Persisting is best-effort: setItem can throw (storage disabled or quota
// exceeded), and a failure here must not stop the caller from applying the
// selection to the page. readColumns treats storage failures the same way.
function writeColumns(columns) {
  try {
    localStorage.setItem(storageKey, JSON.stringify(columns));
  } catch (_error) {
    // Selection stays in effect for this page view; it is just not remembered.
  }
}
// Shows/hides every [data-column] cell and head according to `columns` and
// syncs the [data-column-toggle] checkboxes with the same list.
function applyColumns(columns) {
  const hiddenClassByBase = [
    ["directory-table__cell", "directory-table__cell--hidden"],
    ["directory-table__head", "directory-table__head--hidden"],
  ];
  document.querySelectorAll("[data-column]").forEach((node) => {
    const hidden = !columns.includes(node.dataset.column);
    hiddenClassByBase.forEach(([baseClass, hiddenClass]) => {
      // The modifier is only ever set on elements carrying the matching base class.
      node.classList.toggle(hiddenClass, hidden && node.classList.contains(baseClass));
    });
  });
  document.querySelectorAll("[data-column-toggle]").forEach((checkbox) => {
    checkbox.checked = columns.includes(checkbox.value);
  });
}
// Wires up the column chooser on pages that contain a [data-directory-table]:
// applies the saved selection, binds the modal open/close buttons and keeps
// the selection in sync with the checkboxes.
function setupColumns() {
  if (!document.querySelector("[data-directory-table]")) return;

  let columns = readColumns();
  const modal = document.querySelector("[data-columns-modal]");
  applyColumns(columns);

  const onClick = (selector, handler) => {
    document.querySelectorAll(selector).forEach((button) => {
      button.addEventListener("click", handler);
    });
  };
  onClick("[data-columns-open]", () => {
    modal.hidden = false;
  });
  onClick("[data-columns-close]", () => {
    modal.hidden = true;
  });

  const onToggle = () => {
    const checked = document.querySelectorAll("[data-column-toggle]:checked");
    columns = Array.from(checked).map((item) => item.value);
    // At least one column always stays visible.
    if (!columns.length) columns = ["full_name"];
    writeColumns(columns);
    applyColumns(columns);
  };
  document.querySelectorAll("[data-column-toggle]").forEach((checkbox) => {
    checkbox.addEventListener("change", onToggle);
  });
}
/**
 * Turn every element with a data-row-href attribute into a navigable row.
 *
 * Click, Enter and Space navigate to the row's href unless the event started
 * inside a nested interactive control (link, button, input, select, label).
 * Rows that do not already declare tabindex/role are given tabindex="0" and
 * role="link" so they can receive keyboard focus; without focus the keydown
 * handler could never fire.
 */
function setupClickableRows() {
  const interactiveSelector = "a, button, input, select, label";
  const startedInControl = (event) => Boolean(event.target.closest(interactiveSelector));
  const openRow = (row) => {
    window.location.href = row.dataset.rowHref;
  };
  document.querySelectorAll("[data-row-href]").forEach((row) => {
    // An empty href would only reload the current page; leave such rows inert.
    if (!row.dataset.rowHref) return;
    if (!row.hasAttribute("tabindex")) row.tabIndex = 0;
    if (!row.hasAttribute("role")) row.setAttribute("role", "link");
    row.addEventListener("click", (event) => {
      if (startedInControl(event)) return;
      openRow(row);
    });
    row.addEventListener("keydown", (event) => {
      if (event.key !== "Enter" && event.key !== " ") return;
      if (startedInControl(event)) return;
      // Stop Space from scrolling the page before navigating.
      event.preventDefault();
      openRow(row);
    });
  });
}
/**
 * Keep the crawl progress widgets up to date on pages that contain
 * [data-progress-panel].
 *
 * Every 4 seconds /api/crawl-runs/latest is fetched and the response's
 * `running` run (or, failing that, `latest`) is rendered into the
 * data-progress-* elements. Polling stops as soon as a response has no
 * `running` run, the response is not OK, or the request fails.
 *
 * The next poll is scheduled only after the previous one has settled, so a
 * slow response can never overlap with a newer request and overwrite fresher
 * values.
 */
function setupProgress() {
  const panel = document.querySelector("[data-progress-panel]");
  if (!panel) return;
  const pollDelayMs = 4000;
  const update = (run) => {
    if (!run) return;
    const status = document.querySelector("[data-progress-status]");
    const processed = document.querySelector("[data-progress-processed]");
    const found = document.querySelector("[data-progress-found]");
    const errors = document.querySelector("[data-progress-errors]");
    const fill = document.querySelector("[data-progress-fill]");
    const percent = document.querySelector("[data-progress-percent]");
    if (status) status.textContent = run.status_display || run.status;
    if (processed) processed.textContent = run.processed_count;
    if (found) found.textContent = run.found_count;
    if (errors) errors.textContent = run.error_count;
    if (fill) fill.style.width = `${run.progress_percent}%`;
    if (percent) percent.textContent = run.progress_percent;
  };
  // Resolves to true while a run is still in progress; never rejects.
  const poll = async () => {
    try {
      const response = await fetch("/api/crawl-runs/latest", { credentials: "same-origin" });
      if (!response.ok) return false;
      const data = await response.json();
      const run = data.running || data.latest;
      update(run);
      return Boolean(data.running);
    } catch (_error) {
      return false;
    }
  };
  const scheduleNext = () => {
    window.setTimeout(async () => {
      const keepGoing = await poll();
      if (keepGoing) scheduleNext();
    }, pollDelayMs);
  };
  scheduleNext();
}
setupColumns();
setupClickableRows();
setupProgress();
})();

View File

@@ -8,13 +8,13 @@
</head>
<body class="admin">
<header class="admin__header">
<h1 class="admin__brand">MIEM Employees</h1>
<h1 class="admin__brand"><a class="admin__brand-link" href="/admin">MIEM Employees</a></h1>
<nav class="admin__nav">
<a class="admin__link" href="/admin">Dashboard</a>
<a class="admin__link" href="/admin/employees">Employees</a>
<a class="admin__link" href="/admin/runs">Runs</a>
<a class="admin__link" href="/admin">Обзор</a>
<a class="admin__link" href="/admin/directory">Сотрудники</a>
<a class="admin__link" href="/admin/runs">Запуски</a>
<form method="post" action="/admin/logout">
<button class="button button--ghost" type="submit">Logout</button>
<button class="button button--ghost" type="submit">Выйти</button>
</form>
</nav>
</header>
@@ -24,5 +24,6 @@
<footer class="admin__footer">
Backend {{ backend_version }} · Frontend {{ frontend_version }}
</footer>
{% block scripts %}{% endblock %}
</body>
</html>

View File

@@ -1,21 +1,62 @@
{% extends "base.html" %}
{% block title %}Dashboard · MIEM Employees{% endblock %}
{% block title %}Обзор · MIEM Employees{% endblock %}
{% block content %}
<section class="admin__grid">
<div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div>
<div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div>
<div class="metric"><div class="metric__label">Runs</div><div class="metric__value">{{ counts.runs }}</div></div>
<div class="metric"><div class="metric__label">Errors</div><div class="metric__value">{{ counts.errors }}</div></div>
<a class="metric metric--link" href="/admin/directory"><span class="metric__label">Всего в базе</span><span class="metric__value">{{ counts.total }}</span></a>
<a class="metric metric--link" href="/admin/directory?status=active"><span class="metric__label">Работают</span><span class="metric__value">{{ counts.active }}</span></a>
<a class="metric metric--link" href="{% if latest_run %}/admin/runs/{{ latest_run.id }}#new-employees{% else %}/admin/runs{% endif %}"><span class="metric__label">Новые за запуск</span><span class="metric__value">{{ counts.new_in_last_run }}</span></a>
<a class="metric metric--link" href="/admin/directory?status=dismissed"><span class="metric__label">Уволены</span><span class="metric__value">{{ counts.dismissed }}</span></a>
</section>
<section class="stats-strip">
<div class="stats-strip__item">
<span class="stats-strip__label">Последний добавленный</span>
{% if counts.latest_added %}
<a class="stats-strip__value" href="/admin/employees/{{ counts.latest_added.id }}">{{ counts.latest_added.full_name or counts.latest_added.canonical_url }}</a>
{% else %}
<span class="stats-strip__value">Сотрудников пока нет</span>
{% endif %}
</div>
<a class="stats-strip__item stats-strip__item--link" href="/admin/runs">
<span class="stats-strip__label">Запуски</span>
<span class="stats-strip__value">{{ counts.runs }}</span>
</a>
<div class="stats-strip__item">
<span class="stats-strip__label">Ошибки</span>
<span class="stats-strip__value">{{ counts.errors }}</span>
</div>
</section>
<section class="panel progress-panel" data-progress-panel>
<div class="progress-panel__header">
<h2 class="panel__title">Прогресс парсинга</h2>
<form method="post" action="/admin/crawl-now">
<button class="button" type="submit">Запустить парсинг</button>
</form>
</div>
{% set run = counts.current_running_run or latest_run %}
<div class="progress-panel__body" data-progress-body>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status_display if run else "Ожидание" }}</span>
<span>обработано: <span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span></span>
<span>ошибок: <span data-progress-errors>{{ run.error_count if run else 0 }}</span></span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ run.progress_percent if run else 0 }}%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>{{ run.progress_percent if run else 0 }}</span>%</div>
</div>
</section>
<section class="panel">
<h2 class="panel__title">Latest runs</h2>
<h2 class="panel__title">Последние запуски</h2>
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Started</th></tr></thead>
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Обработано</th><th class="table__head">Ошибки</th><th class="table__head">Старт</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_at }}</td></tr>
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -0,0 +1,118 @@
{% extends "base.html" %}
{% block title %}Сотрудники · MIEM Employees{% endblock %}
{% block content %}
<section class="directory">
<div class="directory__header">
<div>
<h2 class="directory__title">Сотрудники</h2>
<p class="directory__summary">Найдено: {{ page.total }}</p>
</div>
<button class="button" type="button" data-columns-open>Колонки</button>
</div>
{# Directory filter form. Submits via GET to /admin/directory; current values come from `filters`. #}
<form class="directory__filters" method="get" action="/admin/directory">
<input class="directory__input" name="q" value="{{ filters.q }}" placeholder="ФИО или ссылка">
<select class="directory__input" name="status">
<option value="" {% if not filters.status %}selected{% endif %}>Все статусы</option>
<option value="active" {% if filters.status == "active" %}selected{% endif %}>Работает</option>
<option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Уволен</option>
</select>
<select class="directory__input" name="has_email">
<option value="" {% if not filters.has_email %}selected{% endif %}>Любой email</option>
<option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Есть email</option>
<option value="false" {% if filters.has_email == "false" %}selected{% endif %}>Нет email</option>
</select>
<input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="Впервые найден с">
<input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="Впервые найден по">
<select class="directory__input" name="sort">
{% for value, label in [("full_name", "ФИО"), ("status", "Статус"), ("hse_start_year", "Год начала"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения")] %}
<option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Сортировка: {{ label }}</option>
{% endfor %}
</select>
<select class="directory__input" name="direction">
<option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>По возрастанию</option>
<option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>По убыванию</option>
</select>
{# Changing the page size resets the offset and submits immediately. #}
<select class="directory__input" name="limit" onchange="this.form.offset.value = 0; this.form.submit()">
{% for value in [25, 50, 100] %}
<option value="{{ value }}" {% if filters.limit == value %}selected{% endif %}>На странице: {{ value }}</option>
{% endfor %}
</select>
{# Every form submission starts from the first page: re-sending the current
   offset with new filters would land on a stale, possibly empty, page.
   Page-to-page navigation uses the pagination links, not this form. #}
<input type="hidden" name="offset" value="0">
<button class="button" type="submit">Применить</button>
</form>
{# Employee directory table. Each header and cell carries a data-column key;
   admin.js compares these keys with the selected column list and toggles the
   "--hidden" modifier classes accordingly. #}
<div class="directory__table-wrap">
<table class="directory-table" data-directory-table>
<thead>
<tr>
<th class="directory-table__head" data-column="full_name">ФИО</th>
<th class="directory-table__head" data-column="status">Статус</th>
<th class="directory-table__head" data-column="positions">Должности</th>
<th class="directory-table__head" data-column="hse_start_year">Год начала</th>
<th class="directory-table__head" data-column="email">Email</th>
<th class="directory-table__head" data-column="phone">Телефон</th>
<th class="directory-table__head" data-column="address">Адрес</th>
<th class="directory-table__head" data-column="publications_count">Публикации</th>
<th class="directory-table__head" data-column="courses_count">Курсы</th>
<th class="directory-table__head" data-column="first_seen_at">Впервые найден</th>
<th class="directory-table__head" data-column="last_seen_at">Последний раз найден</th>
<th class="directory-table__head" data-column="dismissed_at">Дата увольнения</th>
<th class="directory-table__head" data-column="profile">Профиль</th>
</tr>
</thead>
<tbody>
{% for employee in page.employees %}
{# data-row-href makes the whole row navigate to the employee page (handled in admin.js). #}
<tr class="directory-table__row" data-row-href="/admin/employees/{{ employee.id }}">
<td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "Без имени" }}</td>
<td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status_display }}</span></td>
<td class="directory-table__cell" data-column="positions">{{ employee.positions_text }}</td>
<td class="directory-table__cell" data-column="hse_start_year">{{ employee.hse_start_year or "" }}</td>
<td class="directory-table__cell" data-column="email">{{ employee.email_text }}</td>
<td class="directory-table__cell" data-column="phone">{{ employee.phone_text }}</td>
<td class="directory-table__cell" data-column="address">{{ employee.address or "" }}</td>
<td class="directory-table__cell" data-column="publications_count">{{ employee.publications_count }}</td>
<td class="directory-table__cell" data-column="courses_count">{{ employee.courses_count }}</td>
<td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_display }}</td>
<td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_display }}</td>
<td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_display }}</td>
{# The link is excluded from row navigation: admin.js ignores clicks that start inside an <a>. #}
<td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Открыть</a></td>
</tr>
{% else %}
{# Empty state; colspan equals the number of columns declared in the header (13). #}
<tr><td class="directory-table__empty" colspan="13">По этим фильтрам сотрудники не найдены.</td></tr>
{% endfor %}
</tbody>
</table>
</div>
{# Offset-based pagination. Links reuse the current URL and replace only the
   `offset` query parameter, so active filters are preserved. #}
<div class="directory__pagination">
{# Clamp at 0: when offset is smaller than limit the raw difference is negative. #}
{% set prev_offset = [filters.offset - filters.limit, 0] | max %}
{% set next_offset = filters.offset + filters.limit %}
{% if filters.offset > 0 %}
<a class="admin__link" href="{{ request.url.include_query_params(offset=prev_offset) }}">Назад</a>
{% endif %}
<span class="directory__page">Страница {{ page.page }}{% if page.pages %} из {{ page.pages }}{% endif %}</span>
{# Show "next" only while there are rows beyond the current page. #}
{% if next_offset < page.total %}
<a class="admin__link" href="{{ request.url.include_query_params(offset=next_offset) }}">Вперед</a>
{% endif %}
</div>
</section>
{# Column picker modal, hidden by default. admin.js shows it on a
   [data-columns-open] click and hides it on any [data-columns-close] click
   (both the backdrop and the button below carry that attribute). #}
<div class="columns-modal" data-columns-modal hidden>
<div class="columns-modal__backdrop" data-columns-close></div>
<section class="columns-modal__panel" aria-label="Настройка колонок">
<div class="columns-modal__header">
<h3 class="columns-modal__title">Отображаемые колонки</h3>
<button class="button button--ghost" type="button" data-columns-close>Закрыть</button>
</div>
<div class="columns-modal__grid">
{# One checkbox per column; each value is compared with the table's data-column keys by admin.js. #}
{% for key, label in [("full_name", "ФИО"), ("status", "Статус"), ("positions", "Должности"), ("hse_start_year", "Год начала"), ("email", "Email"), ("phone", "Телефон"), ("address", "Адрес"), ("publications_count", "Публикации"), ("courses_count", "Курсы"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения"), ("profile", "Профиль")] %}
<label class="columns-modal__option"><input class="columns-modal__checkbox" type="checkbox" value="{{ key }}" data-column-toggle> {{ label }}</label>
{% endfor %}
</div>
</section>
</div>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -1,26 +1,226 @@
{% extends "base.html" %}
{% block title %}{{ employee.full_name }} · MIEM Employees{% endblock %}
{% block title %}{{ employee_view.full_name }} · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">{{ employee.full_name or employee.profile_key }}</h2>
<p><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></p>
<p><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></p>
<h3>Tabs</h3>
<ul>
{% for tab in employee.tabs %}
<li><a class="admin__link" href="{{ tab.href }}">{{ tab.title }}</a></li>
<section class="employee-card">
<div class="employee-card__header">
<div class="employee-card__identity">
<h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2>
<span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span>
</div>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a>
</div>
<section class="employee-card__section">
<h3 class="employee-section__title">Основная информация</h3>
<dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Должности</dt>
<dd class="employee-card__meta-value">
{% if employee_view.positions %}
<ul class="employee-card__list">
{% for position in employee_view.positions %}
<li class="employee-card__list-item">{{ position }}</li>
{% endfor %}
</ul>
<h3>Current data</h3>
<pre class="code">{{ employee.current_data | tojson(indent=2) }}</pre>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Год начала работы в ВШЭ</dt><dd class="employee-card__meta-value">{{ employee_view.hse_start_year or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Тип профиля</dt><dd class="employee-card__meta-value">{{ employee_view.profile_type or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">ID профиля</dt><dd class="employee-card__meta-value">{{ employee_view.profile_id or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Впервые найден</dt><dd class="employee-card__meta-value">{{ employee_view.first_seen_display }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Последний раз найден</dt><dd class="employee-card__meta-value">{{ employee_view.last_seen_display }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Дата увольнения</dt><dd class="employee-card__meta-value">{{ employee_view.dismissed_display }}</dd></div>
</dl>
</section>
<section class="employee-card__section">
<h3 class="employee-section__title">Контакты</h3>
<dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Email</dt>
<dd class="employee-card__meta-value">
{% if employee_view.contacts.emails %}
<ul class="employee-card__list">
{% for email in employee_view.contacts.emails %}
<li class="employee-card__list-item"><a class="admin__link" href="mailto:{{ email }}">{{ email }}</a></li>
{% endfor %}
</ul>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Телефоны</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.phones | join(", ") if employee_view.contacts.phones else "Не указано" }}</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Адрес</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.address or "Не указано" }}</dd>
</div>
{% if employee_view.contacts.contact_items %}
<div class="employee-card__meta-item employee-card__meta-item--wide">
<dt class="employee-card__meta-label">Прочее</dt>
<dd class="employee-card__meta-value">
<ul class="employee-card__list">
{% for item in employee_view.contacts.contact_items %}
<li class="employee-card__list-item">{{ item }}</li>
{% endfor %}
</ul>
</dd>
</div>
{% endif %}
</dl>
</section>
{% if employee_view.external_ids %}
<section class="employee-card__section">
<h3 class="employee-section__title">Внешние идентификаторы</h3>
<ul class="employee-card__list">
{% for external_id in employee_view.external_ids %}
<li class="employee-card__list-item">
<strong>{{ external_id.system }}:</strong>
{% if external_id.url %}
<a class="admin__link" href="{{ external_id.url }}">{{ external_id.value }}</a>
{% else %}
{{ external_id.value }}
{% endif %}
</li>
{% endfor %}
</ul>
</section>
{% endif %}
<section class="employee-card__section">
<h3 class="employee-section__title">Разделы профиля</h3>
{% if employee_view.sections %}
<div class="employee-card__sections">
{% for section in employee_view.sections %}
<article class="employee-section">
<div class="employee-section__header">
<h4 class="employee-section__title">{{ section.title }}</h4>
<span class="employee-section__type">{{ section.type }}</span>
</div>
{% if section.type == "year_blocks" and section.year_entries %}
<ul class="employee-card__list">
{% for entry in section.year_entries %}
<li class="employee-card__list-item">{% if entry.year %}<strong>{{ entry.year }}:</strong> {% endif %}{{ entry.text }}</li>
{% endfor %}
</ul>
{% elif section.type == "publications" and section.publications %}
{% if section.publications_count %}<p class="employee-section__note">Всего: {{ section.publications_count }}</p>{% endif %}
<ul class="employee-card__list">
{% for publication in section.publications %}
<li class="employee-card__list-item">
{% if publication.url %}
<a class="admin__link" href="{{ publication.url }}">{{ publication.title }}</a>
{% else %}
{{ publication.title }}
{% endif %}
{% if publication.text and publication.text != publication.title %}<div class="employee-section__text">{{ publication.text }}</div>{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "courses_by_year" and section.courses %}
{% if section.academic_year %}<p class="employee-section__note">Учебный год: {{ section.academic_year }}</p>{% endif %}
<ul class="employee-card__list">
{% for course in section.courses %}
<li class="employee-card__list-item">
{% if course.url %}
<a class="admin__link" href="{{ course.url }}">{{ course.title }}</a>
{% else %}
{{ course.title }}
{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "graduation_theses" and section.theses %}
{% if section.theses_count %}<p class="employee-section__note">Всего: {{ section.theses_count }}</p>{% endif %}
<ul class="employee-card__list">
{% for thesis in section.theses %}
<li class="employee-card__list-item">
{% if thesis.student %}<strong>{{ thesis.student }}</strong>{% endif %}
{% if thesis.title %}
<div class="employee-section__text">
{% if thesis.project_url %}
<a class="admin__link" href="{{ thesis.project_url }}">{{ thesis.title }}</a>
{% else %}
{{ thesis.title }}
{% endif %}
</div>
{% endif %}
<div class="employee-section__meta">
{% if thesis.defense_year %}<span class="employee-section__meta-item">Год защиты: {{ thesis.defense_year }}</span>{% endif %}
{% if thesis.level %}<span class="employee-section__meta-item">{{ thesis.level }}</span>{% endif %}
{% if thesis.rating is not none %}<span class="employee-section__meta-item">Оценка: {{ thesis.rating }}</span>{% endif %}
{% if thesis.program %}
<span class="employee-section__meta-item">
{% if thesis.program_url %}<a class="admin__link" href="{{ thesis.program_url }}">{{ thesis.program }}</a>{% else %}{{ thesis.program }}{% endif %}
</span>
{% endif %}
</div>
</li>
{% endfor %}
</ul>
{% elif section.type == "table" and section.table %}
<div class="employee-section__table-wrap">
<table class="employee-section__table">
{% if section.table.headers %}
<thead><tr>{% for header in section.table.headers %}<th class="employee-section__head">{{ header }}</th>{% endfor %}</tr></thead>
{% endif %}
<tbody>
{% for row in section.table.rows %}
<tr>
{% for cell in row.cells %}
<td class="employee-section__cell">{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
{% if section.paragraphs %}
{% for paragraph in section.paragraphs %}
<p class="employee-section__text">{{ paragraph }}</p>
{% endfor %}
{% endif %}
{% if section.list_items %}
<ul class="employee-card__list">
{% for item in section.list_items %}
<li class="employee-card__list-item">{{ item }}</li>
{% endfor %}
</ul>
{% endif %}
{% endif %}
{% if section.links and section.type not in ["courses_by_year", "graduation_theses"] %}
<div class="employee-section__links">
{% for link in section.links %}
<a class="employee-section__link" href="{{ link.url }}">{{ link.text }}</a>
{% endfor %}
</div>
{% endif %}
</article>
{% endfor %}
</div>
{% else %}
<p class="employee-section__text">Разделы профиля не найдены.</p>
{% endif %}
</section>
</section>
<section class="panel">
<h2 class="panel__title">Snapshots</h2>
<h2 class="panel__title">Снапшоты</h2>
<table class="table">
<thead><tr><th class="table__head">Captured</th><th class="table__head">Checksum</th><th class="table__head">Parser</th></tr></thead>
<thead><tr><th class="table__head">Дата</th><th class="table__head">Checksum</th><th class="table__head">Парсер</th></tr></thead>
<tbody>
{% for snapshot in snapshots %}
<tr><td class="table__cell">{{ snapshot.captured_at }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
<tr><td class="table__cell">{{ snapshot.captured_display }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -1,29 +0,0 @@
{% extends "base.html" %}
{% block title %}Employees · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">Employees</h2>
<form class="form" method="get" action="/admin/employees">
<input class="form__input" name="q" value="{{ q }}" placeholder="Name or URL">
<select class="form__select" name="status">
<option value="" {% if not status %}selected{% endif %}>All</option>
<option value="active" {% if status == "active" %}selected{% endif %}>Active</option>
<option value="dismissed" {% if status == "dismissed" %}selected{% endif %}>Dismissed</option>
</select>
<button class="button" type="submit">Search</button>
</form>
<table class="table">
<thead><tr><th class="table__head">Name</th><th class="table__head">Status</th><th class="table__head">Last seen</th><th class="table__head">Profile</th></tr></thead>
<tbody>
{% for employee in employees %}
<tr>
<td class="table__cell"><a class="admin__link" href="/admin/employees/{{ employee.id }}">{{ employee.full_name or employee.profile_key }}</a></td>
<td class="table__cell"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
<td class="table__cell">{{ employee.last_seen_at }}</td>
<td class="table__cell"><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}

View File

@@ -3,18 +3,18 @@
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Login · MIEM Employees</title>
<title>Вход · MIEM Employees</title>
<link rel="stylesheet" href="/static/admin.css">
</head>
<body class="admin">
<main class="admin__main">
<section class="panel">
<h1 class="panel__title">Admin login</h1>
<h1 class="panel__title">Вход в админку</h1>
{% if error %}<p>{{ error }}</p>{% endif %}
<form class="form" method="post" action="/admin/login">
<label class="form__label">Login <input class="form__input" name="username" autocomplete="username"></label>
<label class="form__label">Password <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
<button class="button" type="submit">Sign in</button>
<label class="form__label">Логин <input class="form__input" name="username" autocomplete="username"></label>
<label class="form__label">Пароль <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
<button class="button" type="submit">Войти</button>
</form>
</section>
</main>

View File

@@ -0,0 +1,64 @@
{% extends "base.html" %}
{# Detail page for a single crawl run. Reads `run` from the template context:
   summary counters, per-group employee changes in `run.changes`, and the
   list of errors in `run.errors`. #}
{% block title %}Запуск {{ run.id }} · MIEM Employees{% endblock %}
{% block content %}
{# Summary panel: heading, start time/status line and the counter strip. #}
<section class="panel">
<div class="progress-panel__header">
<div>
<h2 class="panel__title">Запуск {{ run.id }}</h2>
<p class="progress-panel__empty">{{ run.started_display }} · {{ run.status_display }}</p>
</div>
<a class="admin__link" href="/admin/runs">Все запуски</a>
</div>
<div class="stats-strip">
<div class="stats-strip__item"><span class="stats-strip__label">Найдено</span><span class="stats-strip__value">{{ run.found_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Обработано</span><span class="stats-strip__value">{{ run.parsed_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Новые</span><span class="stats-strip__value">{{ run.new_count }}</span></div>
{# This counter is derived from the length of the change list rather than from a stored count. #}
<div class="stats-strip__item"><span class="stats-strip__label">Потеряшки</span><span class="stats-strip__value">{{ run.changes.missing_from_source | length }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Уволены</span><span class="stats-strip__value">{{ run.dismissed_count }}</span></div>
<div class="stats-strip__item"><span class="stats-strip__label">Ошибки</span><span class="stats-strip__value">{{ run.error_count }}</span></div>
</div>
{# Notice shown when the run has no per-employee change details. #}
{% if not run.changes_detail_available %}
<p class="progress-panel__empty">Детализация сотрудников для этого запуска недоступна. Она сохраняется только для новых запусков после обновления.</p>
{% endif %}
</section>
{# One panel per change group, rendered from run.changes[group]. #}
{% for group, title in [("new", "Новые сотрудники"), ("missing_from_source", "Потеряшки"), ("dismissed", "Уволенные")] %}
{# Only the "new" panel gets an id, which serves as the #new-employees fragment target. #}
<section class="panel"{% if group == "new" %} id="new-employees"{% endif %}>
<h2 class="panel__title">{{ title }}</h2>
{% set items = run.changes[group] %}
{% if items %}
<table class="table">
<thead><tr><th class="table__head">ФИО</th><th class="table__head">Профиль</th><th class="table__head">Проверка</th><th class="table__head">Комментарий</th></tr></thead>
<tbody>
{% for item in items %}
<tr>
{# Link to the admin employee page only when an employee id is present. #}
<td class="table__cell">{% if item.employee_id %}<a class="admin__link" href="/admin/employees/{{ item.employee_id }}">{{ item.full_name or item.profile_key }}</a>{% else %}{{ item.full_name or item.profile_key }}{% endif %}</td>
<td class="table__cell"><a class="admin__link" href="{{ item.profile_url }}">{{ item.profile_url }}</a></td>
<td class="table__cell">{{ item.profile_available_display }}</td>
<td class="table__cell">{{ item.message or "" }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p class="progress-panel__empty">Нет записей.</p>
{% endif %}
</section>
{% endfor %}
{# Errors recorded for this run, or a placeholder when run.errors is empty. #}
<section class="panel">
<h2 class="panel__title">Ошибки запуска</h2>
{% if run.errors %}
<table class="table">
<thead><tr><th class="table__head">Профиль</th><th class="table__head">Ошибка</th><th class="table__head">Время</th></tr></thead>
<tbody>
{% for error in run.errors %}
<tr><td class="table__cell">{{ error.profile_url or "" }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td><td class="table__cell">{{ error.created_display }}</td></tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p class="progress-panel__empty">Ошибок нет.</p>
{% endif %}
</section>
{% endblock %}

View File

@@ -1,22 +1,52 @@
{% extends "base.html" %}
{% block title %}Runs · MIEM Employees{% endblock %}
{% block title %}Запуски · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">Crawl runs</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl</button></form>
<div class="progress-panel__header">
<h2 class="panel__title">Запуски парсинга</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Запустить парсинг</button></form>
</div>
{% set run = runs[0] if runs else none %}
{% if run %}
{% set processed = run.parsed_count + run.error_count %}
{% set percent = ((processed / run.found_count) * 100) | round(1) if run.found_count else 0 %}
<div class="progress-panel" data-progress-panel>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status_display }}</span>
<span>обработано: <span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ run.found_count }}</span></span>
<span>ошибок: <span data-progress-errors>{{ run.error_count }}</span></span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ percent }}%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>{{ percent }}</span>%</div>
</div>
{% else %}
<div class="progress-panel" data-progress-panel>
<div class="progress-panel__meta">
<span data-progress-status>Ожидание</span>
<span>обработано: <span data-progress-processed>0</span> / <span data-progress-found>0</span></span>
<span>ошибок: <span data-progress-errors>0</span></span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: 0%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>0</span>%</div>
</div>
{% endif %}
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead>
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Найдено</th><th class="table__head">Обработано</th><th class="table__head">Новые</th><th class="table__head">Ошибки</th><th class="table__head">Уволены</th><th class="table__head">Старт</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr>
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
<section class="panel">
<h2 class="panel__title">Recent errors</h2>
<h2 class="panel__title">Последние ошибки</h2>
<table class="table">
<thead><tr><th class="table__head">Run</th><th class="table__head">Profile</th><th class="table__head">Error</th></tr></thead>
<thead><tr><th class="table__head">Запуск</th><th class="table__head">Профиль</th><th class="table__head">Ошибка</th></tr></thead>
<tbody>
{% for error in errors %}
<tr><td class="table__cell">{{ error.crawl_run_id }}</td><td class="table__cell">{{ error.profile_url }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td></tr>
@@ -25,3 +55,6 @@
</table>
</section>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.1.0"
FRONTEND_VERSION = "0.1.0"
BACKEND_VERSION = "0.1.0"
APP_VERSION = "0.4.6"
FRONTEND_VERSION = "0.4.6"
BACKEND_VERSION = "0.4.6"

View File

@@ -7,8 +7,6 @@ services:
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-miem_password}
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "${POSTGRES_PORT:-5432}:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-miem} -d ${POSTGRES_DB:-miem_workers}"]
interval: 10s
@@ -22,7 +20,7 @@ services:
environment:
DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers}
ports:
- "${API_PORT:-8000}:8000"
- "127.0.0.1:${API_PORT:-8000}:8000"
depends_on:
postgres:
condition: service_healthy
@@ -44,7 +42,7 @@ services:
environment:
DATABASE_URL: postgresql+psycopg://${POSTGRES_USER:-miem}:${POSTGRES_PASSWORD:-miem_password}@postgres:5432/${POSTGRES_DB:-miem_workers}
ports:
- "${MCP_PORT:-8001}:8000"
- "127.0.0.1:${MCP_PORT:-8001}:8000"
depends_on:
postgres:
condition: service_healthy

View File

@@ -13,6 +13,7 @@ CREATE TABLE IF NOT EXISTS crawl_runs (
finished_at TIMESTAMPTZ,
found_count INTEGER NOT NULL DEFAULT 0,
parsed_count INTEGER NOT NULL DEFAULT 0,
new_count INTEGER NOT NULL DEFAULT 0,
error_count INTEGER NOT NULL DEFAULT 0,
dismissed_count INTEGER NOT NULL DEFAULT 0,
message TEXT

View File

@@ -0,0 +1,2 @@
ALTER TABLE crawl_runs
ADD COLUMN IF NOT EXISTS new_count INTEGER NOT NULL DEFAULT 0;

View File

@@ -0,0 +1,21 @@
-- One row per employee-level change attributed to a crawl run.
-- The migration is written to be re-runnable: every statement uses IF NOT EXISTS.
CREATE TABLE IF NOT EXISTS crawl_run_employee_changes (
id SERIAL PRIMARY KEY,
-- Owning crawl run (required).
crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id),
-- Nullable: a change row may exist without a linked employee record.
employee_id INTEGER REFERENCES employees(id),
profile_key VARCHAR(255) NOT NULL,
profile_url TEXT NOT NULL,
full_name TEXT,
-- Change category, e.g. 'new', 'missing_from_source', 'dismissed'.
change_type VARCHAR(32) NOT NULL,
-- Nullable tri-state flag; presumably the result of re-checking the profile URL (TODO confirm against the crawler).
profile_available BOOLEAN,
message TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Lookup indexes for filtering changes by run, by employee and by change type.
CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_run_id
ON crawl_run_employee_changes (crawl_run_id);
CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_employee_id
ON crawl_run_employee_changes (employee_id);
CREATE INDEX IF NOT EXISTS ix_crawl_run_employee_changes_change_type
ON crawl_run_employee_changes (change_type);

View File

@@ -1,6 +1,6 @@
[project]
name = "miem-workers"
version = "0.1.0"
version = "0.4.6"
description = "MIEM employees parser, admin API, and MCP server"
requires-python = ">=3.11"
dependencies = [

250
tests/test_admin_data.py Normal file
View File

@@ -0,0 +1,250 @@
from datetime import datetime, timezone
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
format_admin_datetime,
list_employees_page,
run_detail_payload,
run_payload,
stats_payload,
)
def test_format_admin_datetime_handles_datetime_string_and_none():
    """format_admin_datetime accepts a datetime, an ISO string, or None."""
    value = datetime(2026, 4, 28, 17, 13, 34, tzinfo=timezone.utc)

    # 17:13 UTC is expected to be displayed as 20:13, i.e. shifted by +03:00.
    assert format_admin_datetime(value) == "28.04.2026 20:13"
    assert format_admin_datetime("2026-04-28T17:13:34.448605+00:00") == "28.04.2026 20:13"
    # A missing value is rendered as a placeholder rather than an empty string.
    assert format_admin_datetime(None) == "Не указано"
def test_employee_display_payload_extracts_common_fields(db_session):
    """employee_display_payload flattens positions, contacts and section counts."""
    employee = Employee(
        profile_key="staff:person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={
            "positions": ["Professor"],
            "hse_start_year": 2024,
            "contacts": {"emails": ["person@hse.ru"], "phones": ["+79990000000"], "address": "Moscow"},
            "sections": [
                {"type": "publications", "publications": [{"title": "Paper"}]},
                {"type": "courses_by_year", "courses": [{"title": "Course"}]},
            ],
        },
    )

    payload = employee_display_payload(employee)

    assert payload["positions_text"] == "Professor"
    assert payload["status_display"] == "Работает"
    assert payload["email_text"] == "person@hse.ru"
    assert payload["publications_count"] == 1
    assert payload["courses_count"] == 1
    # first_seen_at is set, so the placeholder for missing dates must not appear.
    assert payload["first_seen_display"] != "Не указано"
def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
    """employee_detail_payload keeps each section type readable by the detail page."""
    employee = Employee(
        profile_key="staff:person",
        profile_type="staff",
        profile_id="person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={
            "positions": ["Professor"],
            "hse_start_year": 2024,
            "contacts": {
                "emails": ["person@hse.ru"],
                "phones": ["+79990000000"],
                "address": "Moscow",
                "items": [{"raw": "consultation hours"}],
            },
            "external_ids": [{"system": "ORCID", "value": "0000", "url": "https://orcid.org/0000"}],
            "sections": [
                {
                    "title": "Education",
                    "type": "year_blocks",
                    "year_entries": [{"year": 2020, "text": "Master degree"}],
                },
                {
                    "title": "Publications",
                    "type": "publications",
                    "publications": [{"title": "Paper", "text": "Paper details", "url": "https://example.test/paper"}],
                },
                {
                    "title": "Courses",
                    "type": "courses_by_year",
                    "academic_year": "2025/2026",
                    "courses": [{"title": "Course", "url": "https://example.test/course"}],
                },
                {
                    "title": "ВКР",
                    "type": "graduation_theses",
                    "theses_count": 1,
                    "theses": [
                        {
                            "student": "Student Name",
                            "title": "Thesis title",
                            "defense_year": 2025,
                            "project_url": "https://www.hse.ru/edu/vkr/1",
                        }
                    ],
                },
                {
                    "title": "Fallback",
                    "type": "generic",
                    "raw_text": "Fallback text",
                },
            ],
        },
    )

    payload = employee_detail_payload(employee)

    assert payload["contacts"]["emails"] == ["person@hse.ru"]
    # Raw contact "items" are exposed under the "contact_items" key as plain strings.
    assert payload["contacts"]["contact_items"] == ["consultation hours"]
    assert payload["external_ids"][0]["system"] == "ORCID"
    # Sections keep their input order, so they can be addressed by index.
    assert payload["sections"][0]["year_entries"][0]["text"] == "Master degree"
    assert payload["sections"][1]["publications"][0]["title"] == "Paper"
    assert payload["sections"][2]["courses"][0]["title"] == "Course"
    assert payload["sections"][3]["theses"][0]["student"] == "Student Name"
    # A generic section's raw_text ends up as a list of paragraphs.
    assert payload["sections"][4]["paragraphs"] == ["Fallback text"]
def test_employee_payloads_tolerate_malformed_current_data(db_session):
    """Both payload builders return empty values when current_data is not a dict."""
    employee = Employee(
        profile_key="staff:broken",
        canonical_url="https://www.hse.ru/staff/broken",
        full_name="Broken Data",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        # Deliberately the wrong type: a string where a mapping is expected.
        current_data="not-a-dict",
    )

    display = employee_display_payload(employee)
    detail = employee_detail_payload(employee)

    assert display["positions"] == []
    assert display["email_text"] == ""
    assert detail["contacts"]["emails"] == []
    assert detail["contacts"]["contact_items"] == []
    assert detail["sections"] == []
def test_list_employees_page_filters_sorts_and_paginates(db_session):
    """list_employees_page filters by status and reports the effective page size."""
    db_session.add(
        Employee(
            profile_key="staff:b",
            canonical_url="https://www.hse.ru/staff/b",
            full_name="Beta",
            status="dismissed",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"contacts": {"emails": []}},
        )
    )
    db_session.add(
        Employee(
            profile_key="staff:a",
            canonical_url="https://www.hse.ru/staff/a",
            full_name="Alpha",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"contacts": {"emails": ["alpha@hse.ru"]}},
        )
    )
    db_session.commit()

    page = list_employees_page(db_session, status="active", sort="full_name", direction="asc", limit=10)

    # Only the active employee matches the status filter.
    assert page["total"] == 1
    assert page["employees"][0]["full_name"] == "Alpha"
    # limit=10 was requested but 50 is expected back; presumably unsupported
    # page sizes fall back to a default. Confirm against list_employees_page.
    assert page["limit"] == 50
def test_stats_payload_uses_latest_run_new_count(db_session):
    """stats_payload reports employee totals and the new_count of the stored run."""
    db_session.add(
        Employee(
            profile_key="staff:a",
            canonical_url="https://www.hse.ru/staff/a",
            full_name="Alpha",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
        )
    )
    db_session.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=3))
    db_session.commit()

    payload = stats_payload(db_session)

    assert payload["total"] == 1
    assert payload["active"] == 1
    # Taken from the run's new_count column, not from the number of employees.
    assert payload["new_in_last_run"] == 3
def test_run_payload_calculates_progress():
    """run_payload derives processed count and percent from parsed and error counts."""
    run = CrawlRun(
        source_url="https://miem.hse.ru/persons",
        status="running",
        found_count=10,
        parsed_count=4,
        error_count=1,
    )

    payload = run_payload(run)

    # processed = parsed (4) + errors (1); percent = 5 of 10 found.
    assert payload["processed_count"] == 5
    assert payload["progress_percent"] == 50.0
    assert payload["status_display"] == "Выполняется"
def test_run_detail_payload_groups_changes_and_handles_old_runs(db_session):
    """run_detail_payload groups changes by type and flags runs that have none stored."""
    # old_run has no change rows at all; run gets one "new" change and one error.
    old_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed")
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
    employee = Employee(
        profile_key="staff:new",
        canonical_url="https://www.hse.ru/staff/new",
        full_name="New Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add_all([old_run, run, employee])
    # Commit first so run.id and employee.id are populated for the rows below.
    db_session.commit()
    db_session.add(
        CrawlRunEmployeeChange(
            crawl_run_id=run.id,
            employee_id=employee.id,
            profile_key=employee.profile_key,
            profile_url=employee.canonical_url,
            full_name=employee.full_name,
            change_type="new",
            profile_available=True,
            message="added",
        )
    )
    db_session.add(
        CrawlError(crawl_run_id=run.id, profile_url=employee.canonical_url, error_type="ValueError", message="bad")
    )
    db_session.commit()

    payload = run_detail_payload(db_session, run)
    old_payload = run_detail_payload(db_session, old_run)

    assert payload["changes_detail_available"] is True
    assert payload["changes"]["new"][0]["full_name"] == "New Person"
    assert payload["errors"][0]["error_type"] == "ValueError"
    # A run without change rows still exposes the group keys, just empty.
    assert old_payload["changes_detail_available"] is False
    assert old_payload["changes"]["new"] == []

View File

@@ -0,0 +1,93 @@
from pathlib import Path
def test_base_navigation_is_russian_and_has_no_legacy_employees_link():
    """base.html uses Russian navigation labels and no longer links to /admin/employees."""
    # The path is relative, so the test must be run from the repository root.
    template = Path("app/templates/base.html").read_text(encoding="utf-8")

    assert "Обзор" in template
    assert "Сотрудники" in template
    assert "Запуски" in template
    assert "Выйти" in template
    assert '<a class="admin__brand-link" href="/admin">MIEM Employees</a>' in template
    assert ">Employees<" not in template
    assert "/admin/employees" not in template
def test_directory_template_is_russian_and_uses_display_dates():
    """directory.html is localized and renders the pre-formatted *_display date fields."""
    # The path is relative, so the test must be run from the repository root.
    template = Path("app/templates/directory.html").read_text(encoding="utf-8")

    assert "Сотрудники" in template
    assert "Колонки" in template
    assert "Применить" in template
    assert "На странице: {{ value }}" in template
    assert "{% for value in [25, 50, 100] %}" in template
    assert "Найдено:" in template
    assert "employee.first_seen_display" in template
    assert "employee.last_seen_display" in template
    assert "employee.dismissed_display" in template
    # The former English labels must be gone.
    assert "Directory" not in template
    assert "employees found" not in template
def test_admin_employees_route_redirects_to_directory():
    """app/admin.py contains a 303 redirect to /admin/directory (source-text check)."""
    source = Path("app/admin.py").read_text(encoding="utf-8")

    assert 'RedirectResponse("/admin/directory", status_code=303)' in source
def test_dashboard_limits_latest_runs_to_five():
    """app/admin.py limits a started_at-ordered run query to 5, not 10 (source-text check)."""
    source = Path("app/admin.py").read_text(encoding="utf-8")

    assert "order_by(desc(CrawlRun.started_at)).limit(5)" in source
    assert "order_by(desc(CrawlRun.started_at)).limit(10)" not in source
def test_runs_template_links_to_run_detail():
    """runs.html makes each run row navigable via inline click and key handlers."""
    template = Path("app/templates/runs.html").read_text(encoding="utf-8")

    assert 'onclick="window.location.href=\'/admin/runs/{{ run.id }}\'"' in template
    assert "onkeydown=\"if (event.key === 'Enter' || event.key === ' ')" in template
    assert 'role="link"' in template
    assert 'tabindex="0"' in template
    # Neither a data-row-href attribute nor a per-cell anchor is used to link the row.
    assert 'data-row-href="/admin/runs/{{ run.id }}"' not in template
    assert '<a class="admin__link" href="/admin/runs/{{ run.id }}">' not in template
def test_run_detail_template_extends_base_and_shows_change_groups():
    """run_detail.html extends base.html and contains all three change-group titles."""
    template = Path("app/templates/run_detail.html").read_text(encoding="utf-8")

    assert '{% extends "base.html" %}' in template
    # Anchor targeted by the dashboard link ending in "#new-employees".
    assert 'id="new-employees"' in template
    assert "Новые сотрудники" in template
    assert "Потеряшки" in template
    assert "Уволенные" in template
    assert "Детализация сотрудников для этого запуска недоступна" in template
def test_dashboard_metric_cards_link_to_admin_targets():
    """dashboard.html contains the links to the directory, run detail and runs pages."""
    template = Path("app/templates/dashboard.html").read_text(encoding="utf-8")

    assert 'href="/admin/directory"' in template
    assert 'href="/admin/directory?status=active"' in template
    assert '/admin/runs/{{ latest_run.id }}#new-employees' in template
    assert 'href="/admin/directory?status=dismissed"' in template
    assert 'href="/admin/runs"' in template
def test_dashboard_latest_run_rows_link_to_run_detail():
    """dashboard.html makes each run row navigable via inline click and key handlers."""
    template = Path("app/templates/dashboard.html").read_text(encoding="utf-8")

    assert 'onclick="window.location.href=\'/admin/runs/{{ run.id }}\'"' in template
    assert "onkeydown=\"if (event.key === 'Enter' || event.key === ' ')" in template
    assert 'role="link"' in template
    assert 'tabindex="0"' in template
    # Neither a data-row-href attribute nor a per-cell anchor is used to link the row.
    assert 'data-row-href="/admin/runs/{{ run.id }}"' not in template
    assert '<a class="admin__link" href="/admin/runs/{{ run.id }}">' not in template
def test_admin_js_supports_keyboard_activation_for_clickable_rows():
    """admin.js registers a keydown listener and mentions the Enter and Space keys."""
    source = Path("app/static/admin.js").read_text(encoding="utf-8")

    # Plain substring checks: they confirm the tokens exist in the script,
    # not that they are wired together.
    assert 'addEventListener("keydown"' in source
    assert '"Enter"' in source
    assert '" "' in source

View File

@@ -8,7 +8,8 @@ from sqlalchemy.pool import StaticPool
from app.config import Settings, get_settings
from app.db import Base, get_db
from app.main import app
from app.models import Employee
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.security import SESSION_COOKIE, sign_session
def test_health_returns_versions():
@@ -17,10 +18,10 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.1.0"
assert response.json()["backend_version"] == "0.4.6"
def test_mcp_requires_token_and_lists_tools():
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
@@ -37,19 +38,20 @@ def test_mcp_requires_token_and_lists_tools():
session.close()
app.dependency_overrides[get_db] = override_db
app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
client = TestClient(app)
unauthorized = client.post("/mcp", json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
authorized = client.post(
without_auth = client.post("/mcp", json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
with_auth = client.post(
"/mcp",
headers={"Authorization": "Bearer secret"},
headers={"Authorization": "Bearer anything"},
json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
)
assert unauthorized.status_code == 401
assert authorized.status_code == 200
assert authorized.json()["result"]["tools"][0]["name"] == "search_employees"
assert without_auth.status_code == 200
assert with_auth.status_code == 200
assert without_auth.json()["result"]["tools"][0]["name"] == "search_employees"
assert any(tool["name"] == "get_crawl_run_details" for tool in without_auth.json()["result"]["tools"])
assert with_auth.json()["result"]["tools"] == without_auth.json()["result"]["tools"]
app.dependency_overrides.clear()
@@ -87,12 +89,10 @@ def test_mcp_search_employees_returns_matching_employee():
db.close()
app.dependency_overrides[get_db] = override_db
app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
client = TestClient(app)
response = client.post(
"/mcp",
headers={"Authorization": "Bearer secret"},
json={
"jsonrpc": "2.0",
"id": 1,
@@ -105,3 +105,146 @@ def test_mcp_search_employees_returns_matching_employee():
assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"]
app.dependency_overrides.clear()
def test_mcp_get_crawl_run_details_returns_changes():
    """The get_crawl_run_details MCP tool returns the employee changes stored for a run."""
    # A shared in-memory SQLite database: StaticPool keeps a single connection
    # so every session created below sees the same data.
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
    employee = Employee(
        profile_key="staff:new",
        profile_type="staff",
        profile_id="new",
        canonical_url="https://www.hse.ru/staff/new",
        full_name="New Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    session.add_all([run, employee])
    # Commit first so run.id and employee.id are populated for the change row.
    session.commit()
    session.add(
        CrawlRunEmployeeChange(
            crawl_run_id=run.id,
            employee_id=employee.id,
            profile_key=employee.profile_key,
            profile_url=employee.canonical_url,
            full_name=employee.full_name,
            change_type="new",
            profile_available=True,
            message="added",
        )
    )
    session.commit()
    # Read the id before closing the session; the instance is detached afterwards.
    run_id = run.id
    session.close()

    def override_db():
        db = Session()
        try:
            yield db
        finally:
            db.close()

    app.dependency_overrides[get_db] = override_db
    try:
        client = TestClient(app)
        response = client.post(
            "/mcp",
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "get_crawl_run_details", "arguments": {"run_id": run_id}},
            },
        )

        assert response.status_code == 200
        text = response.json()["result"]["content"][0]["text"]
        assert "New Person" in text
        assert "changes_detail_available" in text
    finally:
        # Restore the shared app even when an assertion fails, so the override
        # cannot leak into other tests.
        app.dependency_overrides.clear()
def test_mcp_protected_resource_metadata_route_is_removed():
    """The OAuth protected-resource metadata endpoint is no longer served (404)."""
    client = TestClient(app)

    response = client.get("/.well-known/oauth-protected-resource")

    assert response.status_code == 404
def test_api_employees_and_stats_require_admin_session():
    """With a signed admin session cookie the employees, stats and run-detail APIs answer 200.

    NOTE(review): only the authenticated path is exercised here; the rejection
    of requests without a session is not asserted by this test.
    """
    # A shared in-memory SQLite database: StaticPool keeps a single connection
    # so every session created below sees the same data.
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    db = Session()
    employee = Employee(
        profile_key="staff:alpha",
        profile_type="staff",
        profile_id="alpha",
        canonical_url="https://www.hse.ru/staff/alpha",
        full_name="Alpha Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []},
    )
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
    db.add(employee)
    db.add(run)
    # Commit first so run.id and employee.id are populated for the change row.
    db.commit()
    db.add(
        CrawlRunEmployeeChange(
            crawl_run_id=run.id,
            # Use the persisted id instead of assuming the first row gets id 1.
            employee_id=employee.id,
            profile_key="staff:alpha",
            profile_url="https://www.hse.ru/staff/alpha",
            full_name="Alpha Person",
            change_type="new",
            profile_available=True,
            message="added",
        )
    )
    db.commit()
    # Read the id before closing the session; the instance is detached afterwards.
    run_id = run.id
    db.close()
    settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")

    def override_db():
        session = Session()
        try:
            yield session
        finally:
            session.close()

    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: settings
    try:
        client = TestClient(app)
        # Sign the cookie with the same settings the app is overridden to use.
        client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))

        employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True})
        stats = client.get("/api/stats")
        run_details = client.get(f"/api/crawl-runs/{run_id}")

        assert employees.status_code == 200
        assert employees.json()["total"] == 1
        assert stats.status_code == 200
        assert stats.json()["new_in_last_run"] == 1
        assert run_details.status_code == 200
        assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person"
    finally:
        # Restore the shared app even when an assertion fails, so the overrides
        # cannot leak into other tests.
        app.dependency_overrides.clear()

13
tests/test_config.py Normal file
View File

@@ -0,0 +1,13 @@
from app.config import Settings
def test_empty_crawl_limit_is_treated_as_none():
    """An empty-string crawl_limit is normalized to None."""
    settings = Settings(crawl_limit="")

    assert settings.crawl_limit is None
def test_numeric_crawl_limit_is_parsed():
    """A numeric string crawl_limit is converted to an int."""
    settings = Settings(crawl_limit="25")

    assert settings.crawl_limit == 25

View File

@@ -1,10 +1,25 @@
from datetime import datetime, timezone
from app.models import Employee
from app.services.crawler import _mark_dismissed
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.crawler import _mark_dismissed, _upsert_employee
def test_mark_dismissed_only_marks_missing_active_employees(db_session):
class FakeResponse:
    """Minimal HTTP response stand-in exposing only ``status_code``."""

    def __init__(self, status_code):
        self.status_code = status_code


class FakeSession:
    """HTTP session stand-in that answers GET requests from a URL -> status map."""

    def __init__(self, statuses):
        # Mapping of URL to the status code that a GET for it should return.
        self.statuses = statuses

    def get(self, url, **_kwargs):
        """Return a FakeResponse with the status configured for *url*.

        Extra keyword arguments are accepted and ignored. An unconfigured URL
        raises KeyError, which makes unexpected requests fail loudly.
        """
        return FakeResponse(self.statuses[url])
def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
db_session.add(run)
db_session.add(
Employee(
profile_key="staff:kept",
@@ -16,8 +31,8 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.add(
Employee(
profile_key="staff:gone",
canonical_url="https://www.hse.ru/staff/gone",
profile_key="staff:missing",
canonical_url="https://www.hse.ru/staff/missing",
status="active",
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
@@ -25,10 +40,74 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.commit()
dismissed = _mark_dismissed(db_session, {"staff:kept"})
dismissed = _mark_dismissed(
db_session,
run,
{"staff:kept"},
FakeSession({"https://www.hse.ru/staff/missing": 200}),
30,
)
assert dismissed == 0
assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"
missing = db_session.query(Employee).filter_by(profile_key="staff:missing").one()
assert missing.status == "active"
assert missing.dismissed_at is None
change = db_session.query(CrawlRunEmployeeChange).one()
assert change.change_type == "missing_from_source"
assert change.profile_available is True
def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session):
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
employee = Employee(
profile_key="staff:gone",
canonical_url="https://www.hse.ru/staff/gone",
status="active",
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
)
db_session.add_all([run, employee])
db_session.commit()
dismissed = _mark_dismissed(
db_session,
run,
set(),
FakeSession({"https://www.hse.ru/staff/gone": 404}),
30,
)
assert dismissed == 1
assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"
gone = db_session.query(Employee).filter_by(profile_key="staff:gone").one()
assert gone.status == "dismissed"
assert gone.dismissed_at is not None
assert employee.status == "dismissed"
assert employee.dismissed_at is not None
change = db_session.query(CrawlRunEmployeeChange).one()
assert change.change_type == "dismissed"
assert change.profile_available is False
def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session):
    """Upserting an unseen profile bumps run.new_count and stores a "new" change row."""
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(run)
    db_session.commit()

    _upsert_employee(
        db_session,
        run,
        {
            "source_url": "https://www.hse.ru/staff/newperson",
            "profile_type": "staff",
            "profile_id": "newperson",
            "full_name": "New Person",
            "tabs": [],
            "sections": [],
            "parser_version": "0.2.0",
            "_html": "<html></html>",
        },
    )
    db_session.commit()

    assert run.new_count == 1
    # Exactly one change row is expected; .one() fails on zero or several.
    change = db_session.query(CrawlRunEmployeeChange).one()
    assert change.change_type == "new"
    assert change.full_name == "New Person"

View File

@@ -0,0 +1,30 @@
from pathlib import Path
def test_employee_detail_template_is_human_readable():
    """employee_detail.html uses Russian labels and the renamed payload keys."""
    # The path is relative, so the test must be run from the repository root.
    template = Path("app/templates/employee_detail.html").read_text(encoding="utf-8")

    # No raw data dump and no tabs heading.
    assert "Current data" not in template
    assert "<pre class=\"code\"" not in template
    assert ">Tabs<" not in template
    # The template must use contact_items / list_items, not contacts.items / section.items.
    assert "contacts.items" not in template
    assert "contacts.contact_items" in template
    assert "section.items" not in template
    assert "section.list_items" in template
    assert "Основная информация" in template
    assert "Контакты" in template
    assert "Разделы профиля" in template
    assert "graduation_theses" in template
    assert "Год защиты" in template
    # The former English labels must be gone.
    assert "Parser version" not in template
    assert "First seen" not in template
    assert "Last seen" not in template
    assert "Dismissed at" not in template
    assert "Profile type" not in template
    assert "Profile ID" not in template
    # Their Russian replacements must be present.
    assert "Впервые найден" in template
    assert "Последний раз найден" in template
    assert "Дата увольнения" in template
    assert "Тип профиля" in template
    assert "ID профиля" in template
    assert "Снапшоты" in template

View File

@@ -1,9 +1,110 @@
from bs4 import BeautifulSoup
from app.parser.profile import extract_person_tabs
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
class FakeResponse:
    """HTTP response stand-in that always succeeds and returns a canned JSON payload."""

    def __init__(self, payload):
        self.payload = payload

    def raise_for_status(self):
        """Never raises: the fake response is always treated as successful."""
        return None

    def json(self):
        """Return the payload given at construction time."""
        return self.payload


class FakeSession:
    """HTTP session stand-in that records calls and returns fixed widget payloads.

    ``post`` answers with a flat list of publication items and ``get`` answers
    with a list of graduation theses. Every call is appended to ``posts`` or
    ``gets`` as a ``(url, kwargs)`` tuple so tests can inspect the requests.
    """

    def __init__(self):
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Record the call and return a single-publication response."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "items": [
                        {
                            "id": "888959076",
                            "type": "ARTICLE",
                            "title": "Дублирование пакетов",
                            "year": 2023,
                            "description": {"short": {"ru": "Информационные процессы. 2023."}},
                        }
                    ],
                },
            }
        )

    def get(self, url, **kwargs):
        """Record the call and return a single-thesis response."""
        self.gets.append((url, kwargs))
        return FakeResponse(
            {
                "lang": "ru",
                "success": True,
                "data": [
                    {
                        "id": 1045750164,
                        "year": 2025,
                        "level": "Бакалавриат",
                        "title": "Аппаратно-программный комплекс защиты сети",
                        "rating": 8,
                        "student": "Лесняк Владислав Евгеньевич",
                        "learnProgram": {"title": "Информатика и вычислительная техника", "url": "https://hse.ru/ba/isct/"},
                        "orgUnit": {"title": "МИЭМ", "url": "https://www.hse.ru/org/url/59315150"},
                        "supervisors": [{"url": "https://www.hse.ru/org/persons/803294906", "name": "Борисов Сергей Петрович"}],
                    }
                ],
            }
        )


class GroupedPublicationsSession(FakeSession):
    """FakeSession variant whose ``post`` returns publications grouped by year.

    Here ``result.items`` is a nested mapping (group -> year -> list of items)
    instead of the flat list returned by ``FakeSession.post``.
    """

    def post(self, url, **kwargs):
        """Record the call and return two publications grouped under 2011 and 2012."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "groupType": 2,
                    "items": {
                        "year": {
                            "header": {"ru": "по году", "en": "by year"},
                            "criteria": {"year": []},
                            "items": {
                                "2011": [
                                    {
                                        "id": "146366790",
                                        "type": "ARTICLE",
                                        "title": "Развитие теории самосогласованного поля",
                                        "year": 2011,
                                        "description": {"short": {"ru": "Журнал физической химии 2011."}},
                                    }
                                ],
                                "2012": [
                                    {
                                        "id": "146367323",
                                        "type": "ARTICLE",
                                        "title": "Self-consistent field theory investigation",
                                        "year": 2012,
                                        "description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}},
                                    }
                                ],
                            },
                        }
                    },
                },
            }
        )
def test_normalize_profile_url_supports_staff_and_org_persons():
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
@@ -26,3 +127,60 @@ def test_extract_person_tabs_prefers_person_menu_addition():
assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Publication and thesis widgets on a profile page are turned into sections."""
    # The page only carries the widget <script> tags; their data-* attributes
    # hold the author id and the supervisor (person) id.
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>
        <script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>
        """,
        "html.parser",
    )
    session = FakeSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    publications = next(section for section in sections if section["type"] == "publications")
    theses = next(section for section in sections if section["type"] == "graduation_theses")
    assert publications["publications_count"] == 1
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/888959076"
    assert theses["theses_count"] == 1
    assert theses["theses"][0]["student"] == "Лесняк Владислав Евгеньевич"
    assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
    # The requests themselves: publications are POSTed to the search API and
    # theses are fetched with the person id as supervisorId.
    assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
    assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
    """Publications returned grouped by year are flattened into one ordered list."""
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
        """,
        "html.parser",
    )
    session = GroupedPublicationsSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/133709486",
        {"User-Agent": "test"},
        10,
        [],
    )

    publications = next(section for section in sections if section["type"] == "publications")
    # Both year groups (2011 and 2012) contribute one item each, in that order.
    assert publications["publications_count"] == 2
    assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"]
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790"
    assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323"