Compare commits

...

13 Commits

Author SHA1 Message Date
Anton
cc9481fc6c fix: enrich HSE profile parsing with publications and theses 2026-04-29 14:15:29 +03:00
cf578ce699 Merge pull request 'fix: allow empty CRAWL_LIMIT env value' (#9) from fix/empty-crawl-limit-env into main
Reviewed-on: #9
2026-04-29 09:50:34 +00:00
Anton
765efa1a1c fix: allow empty CRAWL_LIMIT env value 2026-04-29 12:49:58 +03:00
86330885e3 Merge pull request 'fix: localize admin UI and simplify employees navigation' (#8) from fix/admin-russian-ux-cleanup into main
Reviewed-on: #8
2026-04-29 09:39:42 +00:00
Anton
866e2b44d5 fix: localize admin UI and simplify employees navigation 2026-04-29 12:39:16 +03:00
f411de740e Merge pull request 'fix: avoid Jinja dict method collisions in admin templates' (#7) from fix/jinja-dict-method-collisions into main
Reviewed-on: #7
2026-04-29 09:12:13 +00:00
Anton
cdfbb26875 fix: avoid Jinja dict method collisions in admin templates 2026-04-29 12:11:16 +03:00
5eaad38076 Merge pull request 'fix: avoid Jinja dict items collision in employee card' (#6) from fix/employee-card-contact-items into main
Reviewed-on: #6
2026-04-29 08:35:13 +00:00
Anton
af87fa8af3 fix: avoid Jinja dict items collision in employee card 2026-04-29 11:34:46 +03:00
26db5832fd Merge pull request 'fix: harden admin employee views against malformed data' (#5) from fix/admin-employee-view-malformed-data into main
Reviewed-on: #5
2026-04-29 08:12:29 +00:00
Anton
7530cbdb60 fix: harden admin employee views against malformed data 2026-04-29 11:11:57 +03:00
ce90414654 Merge pull request 'fix: make employee detail page human-readable' (#4) from feature/human-readable-employee-card into main
Reviewed-on: #4
2026-04-29 07:38:21 +00:00
Anton
755135d6ba fix: make employee detail page human-readable 2026-04-29 10:37:38 +03:00
23 changed files with 1264 additions and 168 deletions

1
.gitignore vendored
View File

@@ -4,6 +4,7 @@ __pycache__/
*.py[cod]
*.db
.pytest_cache/
pytest-cache-files-*/
.coverage
htmlcov/
postgres_data/

View File

@@ -110,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
docker compose down
```
Версия сервиса: `0.2.1`. Админка всегда показывает версии backend и frontend в footer.
Версия сервиса: `0.2.8`. Админка всегда показывает версии backend и frontend в footer.

View File

@@ -1,14 +1,14 @@
from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy import desc, func, or_, select
from sqlalchemy import desc, func, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
from app.services.admin_data import list_employees_page, run_payload, stats_payload
from app.services.admin_data import employee_detail_payload, format_admin_datetime, list_employees_page, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
@@ -22,8 +22,9 @@ def dashboard(request: Request, db: Session = Depends(get_db), settings: Setting
counts = stats_payload(db)
counts["runs"] = db.scalar(select(func.count()).select_from(CrawlRun)) or 0
counts["errors"] = db.scalar(select(func.count()).select_from(CrawlError)) or 0
runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
return _render(request, "dashboard.html", {"counts": counts, "runs": runs, "latest_run": run_payload(runs[0]) if runs else None})
run_models = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
runs = [run_payload(run) for run in run_models]
return _render(request, "dashboard.html", {"counts": counts, "runs": runs, "latest_run": runs[0] if runs else None})
@router.get("/login", response_class=HTMLResponse)
@@ -57,18 +58,10 @@ def employees(
request: Request,
status: str | None = None,
q: str | None = None,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
stmt = select(Employee)
if status:
stmt = stmt.where(Employee.status == status)
if q:
pattern = f"%{q}%"
stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern)))
items = db.scalars(stmt.order_by(Employee.full_name).limit(200)).all()
return _render(request, "employees.html", {"employees": items, "status": status or "", "q": q or ""})
return RedirectResponse("/admin/directory", status_code=303)
@router.get("/directory", response_class=HTMLResponse)
@@ -115,7 +108,7 @@ def directory(
"has_email": has_email or "",
"sort": sort,
"direction": direction,
"limit": limit,
"limit": page["limit"],
"offset": offset,
},
},
@@ -133,14 +126,26 @@ def employee_detail(
employee = db.get(Employee, employee_id)
if not employee:
return RedirectResponse("/admin/employees", status_code=303)
snapshots = sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20]
return _render(request, "employee_detail.html", {"employee": employee, "snapshots": snapshots})
snapshots = [
{
"captured_display": format_admin_datetime(snapshot.captured_at),
"checksum": snapshot.checksum,
"parser_version": snapshot.parser_version,
}
for snapshot in sorted(employee.snapshots, key=lambda item: item.captured_at, reverse=True)[:20]
]
return _render(
request,
"employee_detail.html",
{"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots},
)
@router.get("/runs", response_class=HTMLResponse)
def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings)
items = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)).all()
run_models = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)).all()
items = [run_payload(run) for run in run_models]
errors = db.scalars(select(CrawlError).order_by(desc(CrawlError.created_at)).limit(50)).all()
return _render(request, "runs.html", {"runs": items, "errors": errors})

View File

@@ -1,5 +1,5 @@
from functools import lru_cache
from pydantic import Field
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -19,6 +19,13 @@ class Settings(BaseSettings):
session_secret: str = Field(default="dev-session-secret", min_length=8)
mcp_token: str = "dev-mcp-token"
@field_validator("crawl_limit", mode="before")
@classmethod
def empty_crawl_limit_as_none(cls, value):
    """Treat a blank ``crawl_limit`` input as "not set".

    The validator is registered with ``mode="before"``, so ``value`` is the
    raw input. Empty and whitespace-only strings become ``None``; any other
    value is returned unchanged for the regular field validation to handle.
    """
    # A value such as "  " is as empty as "" for an optional numeric setting;
    # passing it on would only fail later during integer parsing.
    if isinstance(value, str) and not value.strip():
        return None
    return value
@lru_cache
def get_settings() -> Settings:

View File

@@ -164,6 +164,7 @@ def parse_person_profile(
header = extract_person_header(soup, normalized_url)
tabs = extract_person_tabs(soup, normalized_url)
sections = extract_sections(soup, normalized_url)
sections = enrich_sections_from_hse_widgets(session, soup, normalized_url, headers, timeout, sections)
internal_links = [tab["href"] for tab in tabs if tab.get("href")]
return {
@@ -183,6 +184,25 @@ def parse_person_profile(
}
def enrich_sections_from_hse_widgets(
    session: Session,
    soup: BeautifulSoup,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
    sections: list[dict],
) -> list[dict]:
    """Add widget-backed publications and theses to the parsed sections.

    Works on a shallow copy of ``sections``. Each widget loader is called
    once; when it returns a non-empty list, the matching section is inserted
    or updated. The resulting list is returned.
    """
    result = list(sections)

    widget_publications = _load_widget_publications(session, soup, headers, timeout)
    if widget_publications:
        result = _upsert_publications_section(result, widget_publications)

    widget_theses = _load_widget_graduation_theses(session, soup, source_url, headers, timeout)
    if widget_theses:
        result = _upsert_graduation_theses_section(result, widget_theses)

    return result
def _render_with_playwright(source_url: str, fallback_html: str) -> str:
try:
from playwright.sync_api import sync_playwright
@@ -206,6 +226,89 @@ def _render_with_playwright(source_url: str, fallback_html: str) -> str:
return fallback_html
def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: dict[str, str], timeout: int) -> list[dict]:
    """Fetch the author's publications through the AuthorSearch widget API.

    The author id is read from the ``data-author`` attribute of the
    ``AuthorSearch`` widget script tag. Pages of 100 records are requested
    from ``https://publications.hse.ru/api/searchPubs`` until the API reports
    no more results, an empty page arrives, or 20 pages have been read.

    Returns a de-duplicated list of normalized publication dicts. Returns an
    empty list when the widget or author id is missing. The fetch is
    best-effort: on a request or decoding error, the pages collected so far
    are returned.
    """
    script = soup.select_one('script[data-widget-name="AuthorSearch"][data-author]')
    if not script:
        return []
    author_id = normalize_ws(script.get("data-author"))
    if not author_id:
        return []

    publications: list[dict] = []
    per_page = 100
    max_pages = 20  # hard cap so a misbehaving API cannot loop forever
    for page_id in range(1, max_pages + 1):
        payload = {
            "type": "ANY",
            "filterParams": (
                f'"acceptLanguage":"ru"|"fullTextPublicEnabled": 1|'
                f'"pubsAuthor": {author_id}|"widgetName": "AuthorSearch"'
            ),
            "paginationParams": {
                "publsSort": ["TITLE_ASC"],
                "publsCount": per_page,
                "pageId": page_id,
            },
        }
        try:
            response = session.post(
                "https://publications.hse.ru/api/searchPubs",
                json=payload,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
        except Exception:
            # Deliberate best-effort: keep what was already fetched, but leave
            # through the common exit so the partial result is de-duplicated
            # exactly like a complete one.
            break
        result = data.get("result") if isinstance(data, dict) else {}
        items = result.get("items") if isinstance(result, dict) else []
        if not isinstance(items, list) or not items:
            break
        publications.extend(_normalize_publication_item(item) for item in items if isinstance(item, dict))
        try:
            total = int(result.get("total") or 0)
        except (TypeError, ValueError):
            # A malformed "total" must not discard the pages already read.
            total = 0
        if not result.get("more") and len(publications) >= total:
            break
    return _dedupe_publications(publications)
def _load_widget_graduation_theses(
    session: Session,
    soup: BeautifulSoup,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
) -> list[dict]:
    """Fetch supervised graduation theses through the VKR widget API.

    The person id and optional API path come from the VKR widget script tag
    (``data-person-id`` / ``data-api-url``, defaulting to ``/n/vkr/api/``).
    Returns normalized thesis dicts. Returns an empty list when the widget or
    id is missing, the request fails, or the payload has no list under
    ``"data"``.
    """
    widget = soup.select_one('script[src*="/n/stat/vkr/app.js"][data-person-id]')
    if not widget:
        return []

    supervisor_id = normalize_ws(widget.get("data-person-id"))
    endpoint = normalize_ws(widget.get("data-api-url")) or "/n/vkr/api/"
    if not supervisor_id:
        return []

    localized_headers = {**headers, "x-portal-language": "ru"}
    try:
        response = session.get(
            urljoin(source_url, endpoint),
            params={"supervisorId": supervisor_id},
            headers=localized_headers,
            timeout=timeout,
        )
        response.raise_for_status()
        body = response.json()
    except Exception:
        return []

    raw_items = body.get("data") if isinstance(body, dict) else []
    if not isinstance(raw_items, list):
        return []
    return [_normalize_vkr_item(entry, source_url) for entry in raw_items if isinstance(entry, dict)]
def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]:
nodes = []
for sibling in start_h2.next_siblings:
@@ -353,6 +456,122 @@ def _parse_vkr_items(nodes: list) -> list[str]:
return [item for item in dict.fromkeys(items) if item]
def _normalize_publication_item(item: dict) -> dict:
    """Convert one raw publication record into the section's item format.

    The result carries ``id``, ``title``, ``year``, ``type``, ``url`` and a
    ``text`` summary built from the title, the year and the short description.
    Missing title/text values fall back to the publication id.
    """
    pub_id = str(item.get("id") or "").strip()
    clean_title = _html_to_text(item.get("title"))
    pub_year = item.get("year")
    kind = str(item.get("type") or "").strip()

    raw_description = item.get("description")
    if not isinstance(raw_description, dict):
        raw_description = {}
    summary = _localized_value(raw_description.get("short"))
    if not summary:
        summary = _localized_value(raw_description.get("shortLeft"))

    fragments = [clean_title, str(pub_year or ""), summary]
    combined = normalize_ws(" ".join(filter(None, fragments)))

    return {
        "id": pub_id or None,
        "title": clean_title or pub_id,
        "year": pub_year,
        "type": kind or None,
        "url": f"https://publications.hse.ru/view/{pub_id}" if pub_id else None,
        "text": combined or clean_title or pub_id,
    }
def _normalize_vkr_item(item: dict, source_url: str) -> dict:
    """Convert one raw thesis record into the section's item format.

    Relative program / org-unit URLs are resolved against ``source_url``, and
    the project URL is built from the thesis id. Supervisor entries that are
    not dicts, or that have neither a name nor a URL, are dropped.
    """
    thesis_id = item.get("id")

    raw_program = item.get("learnProgram")
    program = raw_program if isinstance(raw_program, dict) else {}
    raw_unit = item.get("orgUnit")
    org_unit = raw_unit if isinstance(raw_unit, dict) else {}

    supervisors = []
    for entry in item.get("supervisors") or []:
        if isinstance(entry, dict):
            name = normalize_ws(entry.get("name"))
            url = normalize_ws(entry.get("url"))
            if name or url:
                supervisors.append({"name": name or url, "url": url or None})

    program_link = program.get("url")
    unit_link = org_unit.get("url")
    summary_parts = [item.get("student"), item.get("title"), item.get("year")]

    return {
        "id": thesis_id,
        "student": normalize_ws(item.get("student")),
        "title": normalize_ws(item.get("title")),
        "defense_year": item.get("year"),
        "level": normalize_ws(item.get("level")),
        "rating": item.get("rating"),
        "project_url": urljoin(source_url, f"/edu/vkr/{thesis_id}") if thesis_id else None,
        "program": normalize_ws(program.get("title")),
        "program_url": urljoin(source_url, program_link) if program_link else None,
        "org_unit": normalize_ws(org_unit.get("title")),
        "org_unit_url": urljoin(source_url, unit_link) if unit_link else None,
        "supervisors": supervisors,
        "text": normalize_ws(" ".join(str(part) for part in summary_parts if part)),
    }
def _upsert_publications_section(sections: list[dict], publications: list[dict]) -> list[dict]:
merged = []
inserted = False
for section in sections:
if section.get("type") != "publications":
merged.append(section)
continue
existing = section.get("publications") or []
section = {
**section,
"publications_count": max(section.get("publications_count") or 0, len(publications)),
"publications": _dedupe_publications([*existing, *publications]),
}
section["items"] = [item["text"] for item in section["publications"] if item.get("text")]
merged.append(section)
inserted = True
if not inserted:
merged.append(
{
"title": "Публикации и исследования",
"slug": "publikacii_i_issledovaniya",
"type": "publications",
"raw_text": "",
"paragraphs": [],
"items": [item["text"] for item in publications if item.get("text")],
"links": [],
"publications_count": len(publications),
"publications": publications,
}
)
return merged
def _upsert_graduation_theses_section(sections: list[dict], theses: list[dict]) -> list[dict]:
section = {
"title": "Выпускные квалификационные работы студентов НИУ ВШЭ",
"slug": "vypusknye_kvalifikacionnye_raboty_studentov_niu_vshe",
"type": "graduation_theses",
"raw_text": "",
"paragraphs": [],
"items": [item["text"] for item in theses if item.get("text")],
"links": [{"text": item["title"], "url": item["project_url"]} for item in theses if item.get("title") and item.get("project_url")],
"theses_count": len(theses),
"theses": theses,
}
return [item for item in sections if item.get("type") != "graduation_theses"] + [section]
def _dedupe_publications(items: list[dict]) -> list[dict]:
seen = set()
unique = []
for item in items:
key = item.get("id") or item.get("url") or item.get("title")
if key and key not in seen:
seen.add(key)
unique.append(item)
return unique
def _html_to_text(value: object) -> str:
    """Strip markup from ``value`` and return whitespace-normalized text."""
    markup = str(value or "")
    plain = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
    return normalize_ws(plain)
def _localized_value(value: object) -> str:
    """Pick a display string from a plain value or a per-language dict.

    For dicts the first truthy entry among ``ru``, ``publ`` and ``en`` is
    used; any other value is converted to a string.
    """
    if not isinstance(value, dict):
        return normalize_ws(str(value or ""))
    return normalize_ws(value.get("ru") or value.get("publ") or value.get("en"))
def _slugify(value: str) -> str:
cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
from datetime import date, datetime, time
from math import ceil
from typing import Any
from zoneinfo import ZoneInfo
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
@@ -20,18 +21,20 @@ EMPLOYEE_SORTS = {
def employee_display_payload(employee: Employee) -> dict[str, Any]:
data = employee.current_data or {}
contacts = data.get("contacts") or {}
sections = data.get("sections") or []
emails = contacts.get("emails") or []
phones = contacts.get("phones") or []
data = _as_dict(employee.current_data)
contacts = _as_dict(data.get("contacts"))
sections = _as_list(data.get("sections"))
positions = _clean_list(data.get("positions"))
emails = _clean_list(contacts.get("emails"))
phones = _clean_list(contacts.get("phones"))
return {
"id": employee.id,
"full_name": employee.full_name,
"status": employee.status,
"status_display": _employee_status_display(employee.status),
"canonical_url": employee.canonical_url,
"positions": data.get("positions") or [],
"positions_text": "; ".join(data.get("positions") or []),
"positions": positions,
"positions_text": "; ".join(positions),
"hse_start_year": data.get("hse_start_year"),
"emails": emails,
"email_text": ", ".join(emails),
@@ -43,6 +46,28 @@ def employee_display_payload(employee: Employee) -> dict[str, Any]:
"first_seen_at": employee.first_seen_at.isoformat() if employee.first_seen_at else None,
"last_seen_at": employee.last_seen_at.isoformat() if employee.last_seen_at else None,
"dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None,
"first_seen_display": format_admin_datetime(employee.first_seen_at),
"last_seen_display": format_admin_datetime(employee.last_seen_at),
"dismissed_display": format_admin_datetime(employee.dismissed_at),
}
def employee_detail_payload(employee: Employee) -> dict[str, Any]:
    """Build the view model for the employee detail page.

    Starts from ``employee_display_payload`` and adds profile identifiers,
    normalized contacts, external ids and normalized sections. Model
    attributes take precedence over values stored in ``current_data``.
    """
    current = _as_dict(employee.current_data)
    contact_data = _as_dict(current.get("contacts"))

    payload = dict(employee_display_payload(employee))
    payload["profile_type"] = employee.profile_type or current.get("profile_type")
    payload["profile_id"] = employee.profile_id or current.get("profile_id")
    payload["parser_version"] = employee.parser_version or current.get("parser_version")
    payload["contacts"] = {
        "emails": _clean_list(contact_data.get("emails")),
        "phones": _clean_list(contact_data.get("phones")),
        "address": contact_data.get("address"),
        "contact_items": _normalize_contact_items(contact_data.get("items")),
    }
    payload["external_ids"] = _normalize_external_ids(current.get("external_ids"))
    payload["sections"] = [_normalize_section(entry) for entry in _as_list(current.get("sections"))]
    return payload
@@ -87,7 +112,7 @@ def list_employees_page(
limit: int = 50,
offset: int = 0,
) -> dict[str, Any]:
limit = max(1, min(limit, 200))
limit = limit if limit in {25, 50, 100} else 50
offset = max(0, offset)
base_stmt = build_employee_query(
status=status,
@@ -101,7 +126,7 @@ def list_employees_page(
order = desc(sort_column) if direction == "desc" else sort_column
employees = db.scalars(base_stmt.order_by(order).limit(limit).offset(offset)).all()
return {
"items": [employee_display_payload(employee) for employee in employees],
"employees": [employee_display_payload(employee) for employee in employees],
"total": total,
"limit": limit,
"offset": offset,
@@ -134,8 +159,11 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
"id": run.id,
"source_url": run.source_url,
"status": run.status,
"status_display": _run_status_display(run.status),
"started_at": run.started_at.isoformat() if run.started_at else None,
"finished_at": run.finished_at.isoformat() if run.finished_at else None,
"started_display": format_admin_datetime(run.started_at),
"finished_display": format_admin_datetime(run.finished_at),
"found_count": run.found_count,
"parsed_count": run.parsed_count,
"new_count": run.new_count,
@@ -147,6 +175,31 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
}
def format_admin_datetime(value: Any) -> str:
    """Format a timestamp for the admin UI as ``DD.MM.YYYY HH:MM``.

    Falsy input yields the "not specified" label. ISO strings are parsed
    first (a trailing ``Z`` is read as UTC); strings that do not parse are
    returned unchanged, and other non-datetime values are stringified.
    Timezone-aware datetimes are converted to Europe/Moscow; naive ones are
    formatted as they are.
    """
    if not value:
        return "Не указано"

    moment = value
    if isinstance(moment, str):
        try:
            moment = datetime.fromisoformat(moment.replace("Z", "+00:00"))
        except ValueError:
            return value

    if not isinstance(moment, datetime):
        return str(moment)

    if moment.tzinfo:
        moment = moment.astimezone(ZoneInfo("Europe/Moscow"))
    return moment.strftime("%d.%m.%Y %H:%M")
def _employee_status_display(status: str | None) -> str:
labels = {"active": "Работает", "dismissed": "Уволен"}
return labels.get(status or "", status or "Не указано")
def _run_status_display(status: str | None) -> str:
labels = {"running": "Выполняется", "completed": "Завершен", "failed": "Ошибка"}
return labels.get(status or "", status or "Не указано")
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:
@@ -157,3 +210,187 @@ def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> i
elif section_type == "courses_by_year":
total += len(section.get("courses") or [])
return total
def _clean_list(values: Any) -> list[str]:
if values is None:
return []
if not isinstance(values, list):
values = [values]
return [str(value).strip() for value in values if str(value or "").strip()]
def _as_dict(value: Any) -> dict[str, Any]:
return value if isinstance(value, dict) else {}
def _as_list(value: Any) -> list[Any]:
if value is None:
return []
return value if isinstance(value, list) else [value]
def _normalize_contact_items(items: Any) -> list[str]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if isinstance(item, dict):
value = item.get("raw") or item.get("value") or item.get("text")
else:
value = item
value = str(value or "").strip()
if value:
normalized.append(value)
return normalized
def _normalize_external_ids(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
system = str(item.get("system") or "").strip()
value = str(item.get("value") or "").strip()
url = str(item.get("url") or "").strip()
if system or value or url:
normalized.append({"system": system or "ID", "value": value or url, "url": url or None})
return normalized
def _normalize_section(section: Any) -> dict[str, Any]:
if not isinstance(section, dict):
return {"title": "Раздел", "type": "generic", "paragraphs": [str(section)], "items": [], "links": []}
section_type = section.get("type") or "generic"
paragraphs = _clean_list(section.get("paragraphs"))
items = _clean_list(section.get("items"))
raw_text = str(section.get("raw_text") or "").strip()
if not paragraphs and not items and raw_text:
paragraphs = [raw_text]
return {
"title": section.get("title") or "Раздел",
"type": section_type,
"raw_text": raw_text,
"paragraphs": paragraphs,
"list_items": items,
"links": _normalize_links(section.get("links")),
"year_entries": _normalize_year_entries(section.get("year_entries")),
"publications": _normalize_publications(section.get("publications")),
"publications_count": section.get("publications_count"),
"theses": _normalize_theses(section.get("theses")),
"theses_count": section.get("theses_count"),
"academic_year": section.get("academic_year"),
"courses": _normalize_courses(section.get("courses")),
"table": _normalize_table(section.get("table")),
}
def _normalize_links(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
text = str(item.get("text") or item.get("url") or "").strip()
url = str(item.get("url") or "").strip()
if text and url:
normalized.append({"text": text, "url": url})
return normalized
def _normalize_year_entries(items: Any) -> list[dict[str, Any]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
text = str(item.get("text") or "").strip()
if text:
normalized.append({"year": item.get("year"), "text": text, "links": _normalize_links(item.get("links"))})
return normalized
def _normalize_publications(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
text = str(item or "").strip()
if text:
normalized.append({"title": text, "text": text, "url": None})
continue
title = str(item.get("title") or "").strip()
text = str(item.get("text") or title).strip()
url = str(item.get("url") or "").strip()
if title or text:
normalized.append({"title": title or text, "text": text or title, "url": url or None})
return normalized
def _normalize_courses(items: Any) -> list[dict[str, str | None]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
title = str(item or "").strip()
if title:
normalized.append({"title": title, "url": None})
continue
title = str(item.get("title") or "").strip()
url = str(item.get("url") or "").strip()
if title or url:
normalized.append({"title": title or url, "url": url or None})
return normalized
def _normalize_theses(items: Any) -> list[dict[str, Any]]:
normalized = []
if not isinstance(items, list):
return normalized
for item in items:
if not isinstance(item, dict):
continue
title = str(item.get("title") or "").strip()
student = str(item.get("student") or "").strip()
if not title and not student:
continue
normalized.append(
{
"id": item.get("id"),
"student": student,
"title": title,
"defense_year": item.get("defense_year") or item.get("year"),
"level": str(item.get("level") or "").strip(),
"rating": item.get("rating"),
"project_url": str(item.get("project_url") or "").strip() or None,
"program": str(item.get("program") or "").strip(),
"program_url": str(item.get("program_url") or "").strip() or None,
"org_unit": str(item.get("org_unit") or "").strip(),
"org_unit_url": str(item.get("org_unit_url") or "").strip() or None,
}
)
return normalized
def _normalize_table(table: Any) -> dict[str, Any] | None:
if not isinstance(table, dict):
return None
headers = _clean_list(table.get("headers"))
rows = []
for row in table.get("rows") or []:
if not isinstance(row, dict):
continue
cells = _clean_list(row.get("cells"))
if cells:
rows.append({"cells": cells, "link_url": row.get("link_url")})
if not headers and not rows:
return None
return {"headers": headers, "rows": rows}

View File

@@ -152,6 +152,177 @@
white-space: pre-wrap;
}
.employee-card {
display: grid;
gap: 18px;
}
.employee-card__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 18px;
padding: 22px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.employee-card__identity {
display: grid;
gap: 10px;
}
.employee-card__title {
margin: 0;
font-size: 24px;
}
.employee-card__section {
padding: 20px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.employee-card__meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 14px;
margin: 0;
}
.employee-card__meta-item {
min-width: 0;
}
.employee-card__meta-item--wide {
grid-column: 1 / -1;
}
.employee-card__meta-label {
margin-bottom: 5px;
color: #6b7280;
font-size: 12px;
font-weight: 700;
text-transform: uppercase;
}
.employee-card__meta-value {
margin: 0;
color: #1f2937;
line-height: 1.45;
}
.employee-card__list {
display: grid;
gap: 8px;
margin: 0;
padding-left: 18px;
}
.employee-card__list-item {
line-height: 1.45;
}
.employee-card__sections {
display: grid;
gap: 14px;
}
.employee-section {
padding: 16px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 8px;
}
.employee-section__header {
display: flex;
align-items: flex-start;
justify-content: space-between;
gap: 12px;
margin-bottom: 12px;
}
.employee-section__title {
margin: 0;
font-size: 17px;
}
.employee-section__type {
flex: 0 0 auto;
padding: 3px 8px;
color: #475569;
background: #e2e8f0;
border-radius: 999px;
font-size: 12px;
}
.employee-section__note {
margin: 0 0 10px;
color: #4b5563;
font-weight: 700;
}
.employee-section__text {
margin: 0 0 10px;
line-height: 1.55;
}
.employee-section__meta {
display: flex;
flex-wrap: wrap;
gap: 8px 12px;
color: #4b5563;
font-size: 13px;
}
.employee-section__meta-item {
line-height: 1.4;
}
.employee-section__table-wrap {
overflow-x: auto;
}
.employee-section__table {
width: 100%;
border-collapse: collapse;
background: #ffffff;
}
.employee-section__head,
.employee-section__cell {
padding: 10px;
border-bottom: 1px solid #e5e7eb;
text-align: left;
vertical-align: top;
}
.employee-section__head {
color: #374151;
background: #f3f4f6;
font-size: 13px;
}
.employee-section__links {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 12px;
}
.employee-section__link {
padding: 5px 9px;
color: #0f766e;
background: #ccfbf1;
border-radius: 999px;
font-size: 12px;
font-weight: 700;
text-decoration: none;
}
.stats-strip {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
@@ -399,7 +570,8 @@
}
.progress-panel__header,
.directory__header {
.directory__header,
.employee-card__header {
align-items: stretch;
flex-direction: column;
}

View File

@@ -79,7 +79,7 @@
const errors = document.querySelector("[data-progress-errors]");
const fill = document.querySelector("[data-progress-fill]");
const percent = document.querySelector("[data-progress-percent]");
if (status) status.textContent = run.status;
if (status) status.textContent = run.status_display || run.status;
if (processed) processed.textContent = run.processed_count;
if (found) found.textContent = run.found_count;
if (errors) errors.textContent = run.error_count;

View File

@@ -10,12 +10,11 @@
<header class="admin__header">
<h1 class="admin__brand">MIEM Employees</h1>
<nav class="admin__nav">
<a class="admin__link" href="/admin">Dashboard</a>
<a class="admin__link" href="/admin/directory">Directory</a>
<a class="admin__link" href="/admin/employees">Employees</a>
<a class="admin__link" href="/admin/runs">Runs</a>
<a class="admin__link" href="/admin">Обзор</a>
<a class="admin__link" href="/admin/directory">Сотрудники</a>
<a class="admin__link" href="/admin/runs">Запуски</a>
<form method="post" action="/admin/logout">
<button class="button button--ghost" type="submit">Logout</button>
<button class="button button--ghost" type="submit">Выйти</button>
</form>
</nav>
</header>

View File

@@ -1,43 +1,43 @@
{% extends "base.html" %}
{% block title %}Dashboard · MIEM Employees{% endblock %}
{% block title %}Обзор · MIEM Employees{% endblock %}
{% block content %}
<section class="admin__grid">
<div class="metric"><div class="metric__label">Total</div><div class="metric__value">{{ counts.total }}</div></div>
<div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div>
<div class="metric"><div class="metric__label">New in last run</div><div class="metric__value">{{ counts.new_in_last_run }}</div></div>
<div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div>
<div class="metric"><div class="metric__label">Всего в базе</div><div class="metric__value">{{ counts.total }}</div></div>
<div class="metric"><div class="metric__label">Работают</div><div class="metric__value">{{ counts.active }}</div></div>
<div class="metric"><div class="metric__label">Новые за запуск</div><div class="metric__value">{{ counts.new_in_last_run }}</div></div>
<div class="metric"><div class="metric__label">Уволены</div><div class="metric__value">{{ counts.dismissed }}</div></div>
</section>
<section class="stats-strip">
<div class="stats-strip__item">
<span class="stats-strip__label">Latest added</span>
<span class="stats-strip__label">Последний добавленный</span>
{% if counts.latest_added %}
<a class="stats-strip__value" href="/admin/employees/{{ counts.latest_added.id }}">{{ counts.latest_added.full_name or counts.latest_added.canonical_url }}</a>
{% else %}
<span class="stats-strip__value">No employees yet</span>
<span class="stats-strip__value">Сотрудников пока нет</span>
{% endif %}
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Runs</span>
<span class="stats-strip__label">Запуски</span>
<span class="stats-strip__value">{{ counts.runs }}</span>
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Errors</span>
<span class="stats-strip__label">Ошибки</span>
<span class="stats-strip__value">{{ counts.errors }}</span>
</div>
</section>
<section class="panel progress-panel" data-progress-panel>
<div class="progress-panel__header">
<h2 class="panel__title">Parsing progress</h2>
<h2 class="panel__title">Прогресс парсинга</h2>
<form method="post" action="/admin/crawl-now">
<button class="button" type="submit">Start crawl now</button>
<button class="button" type="submit">Запустить парсинг</button>
</form>
</div>
{% set run = counts.current_running_run or latest_run %}
<div class="progress-panel__body" data-progress-body>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status if run else "idle" }}</span>
<span><span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span> processed</span>
<span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
<span data-progress-status>{{ run.status_display if run else "Ожидание" }}</span>
<span>обработано: <span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span></span>
<span>ошибок: <span data-progress-errors>{{ run.error_count if run else 0 }}</span></span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ run.progress_percent if run else 0 }}%"></div>
@@ -46,12 +46,12 @@
</div>
</section>
<section class="panel">
<h2 class="panel__title">Latest runs</h2>
<h2 class="panel__title">Последние запуски</h2>
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Started</th></tr></thead>
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Обработано</th><th class="table__head">Ошибки</th><th class="table__head">Старт</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_at }}</td></tr>
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -1,65 +1,71 @@
{% extends "base.html" %}
{% block title %}Directory · MIEM Employees{% endblock %}
{% block title %}Сотрудники · MIEM Employees{% endblock %}
{% block content %}
<section class="directory">
<div class="directory__header">
<div>
<h2 class="directory__title">Directory</h2>
<p class="directory__summary">{{ page.total }} employees found</p>
<h2 class="directory__title">Сотрудники</h2>
<p class="directory__summary">Найдено: {{ page.total }}</p>
</div>
<button class="button" type="button" data-columns-open>Columns</button>
<button class="button" type="button" data-columns-open>Колонки</button>
</div>
<form class="directory__filters" method="get" action="/admin/directory">
<input class="directory__input" name="q" value="{{ filters.q }}" placeholder="Name or URL">
<input class="directory__input" name="q" value="{{ filters.q }}" placeholder="ФИО или ссылка">
<select class="directory__input" name="status">
<option value="" {% if not filters.status %}selected{% endif %}>All statuses</option>
<option value="active" {% if filters.status == "active" %}selected{% endif %}>Active</option>
<option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Dismissed</option>
<option value="" {% if not filters.status %}selected{% endif %}>Все статусы</option>
<option value="active" {% if filters.status == "active" %}selected{% endif %}>Работает</option>
<option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Уволен</option>
</select>
<select class="directory__input" name="has_email">
<option value="" {% if not filters.has_email %}selected{% endif %}>Any email</option>
<option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Has email</option>
<option value="false" {% if filters.has_email == "false" %}selected{% endif %}>No email</option>
<option value="" {% if not filters.has_email %}selected{% endif %}>Любой email</option>
<option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Есть email</option>
<option value="false" {% if filters.has_email == "false" %}selected{% endif %}>Нет email</option>
</select>
<input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="First seen from">
<input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="First seen to">
<input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="Впервые найден с">
<input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="Впервые найден по">
<select class="directory__input" name="sort">
{% for value, label in [("full_name", "Name"), ("status", "Status"), ("hse_start_year", "HSE start"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed")] %}
<option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Sort: {{ label }}</option>
{% for value, label in [("full_name", "ФИО"), ("status", "Статус"), ("hse_start_year", "Год начала"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения")] %}
<option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Сортировка: {{ label }}</option>
{% endfor %}
</select>
<select class="directory__input" name="direction">
<option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>Ascending</option>
<option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>Descending</option>
<option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>По возрастанию</option>
<option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>По убыванию</option>
</select>
<button class="button" type="submit">Apply</button>
<select class="directory__input" name="limit" onchange="this.form.offset.value = 0; this.form.submit()">
{% for value in [25, 50, 100] %}
<option value="{{ value }}" {% if filters.limit == value %}selected{% endif %}>На странице: {{ value }}</option>
{% endfor %}
</select>
<input type="hidden" name="offset" value="{{ filters.offset }}">
<button class="button" type="submit">Применить</button>
</form>
<div class="directory__table-wrap">
<table class="directory-table" data-directory-table>
<thead>
<tr>
<th class="directory-table__head" data-column="full_name">Name</th>
<th class="directory-table__head" data-column="status">Status</th>
<th class="directory-table__head" data-column="positions">Positions</th>
<th class="directory-table__head" data-column="hse_start_year">HSE start</th>
<th class="directory-table__head" data-column="full_name">ФИО</th>
<th class="directory-table__head" data-column="status">Статус</th>
<th class="directory-table__head" data-column="positions">Должности</th>
<th class="directory-table__head" data-column="hse_start_year">Год начала</th>
<th class="directory-table__head" data-column="email">Email</th>
<th class="directory-table__head" data-column="phone">Phone</th>
<th class="directory-table__head" data-column="address">Address</th>
<th class="directory-table__head" data-column="publications_count">Publications</th>
<th class="directory-table__head" data-column="courses_count">Courses</th>
<th class="directory-table__head" data-column="first_seen_at">First seen</th>
<th class="directory-table__head" data-column="last_seen_at">Last seen</th>
<th class="directory-table__head" data-column="dismissed_at">Dismissed</th>
<th class="directory-table__head" data-column="profile">Profile</th>
<th class="directory-table__head" data-column="phone">Телефон</th>
<th class="directory-table__head" data-column="address">Адрес</th>
<th class="directory-table__head" data-column="publications_count">Публикации</th>
<th class="directory-table__head" data-column="courses_count">Курсы</th>
<th class="directory-table__head" data-column="first_seen_at">Впервые найден</th>
<th class="directory-table__head" data-column="last_seen_at">Последний раз найден</th>
<th class="directory-table__head" data-column="dismissed_at">Дата увольнения</th>
<th class="directory-table__head" data-column="profile">Профиль</th>
</tr>
</thead>
<tbody>
{% for employee in page.items %}
{% for employee in page.employees %}
<tr class="directory-table__row" data-row-href="/admin/employees/{{ employee.id }}">
<td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "No name" }}</td>
<td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
<td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "Без имени" }}</td>
<td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status_display }}</span></td>
<td class="directory-table__cell" data-column="positions">{{ employee.positions_text }}</td>
<td class="directory-table__cell" data-column="hse_start_year">{{ employee.hse_start_year or "" }}</td>
<td class="directory-table__cell" data-column="email">{{ employee.email_text }}</td>
@@ -67,13 +73,13 @@
<td class="directory-table__cell" data-column="address">{{ employee.address or "" }}</td>
<td class="directory-table__cell" data-column="publications_count">{{ employee.publications_count }}</td>
<td class="directory-table__cell" data-column="courses_count">{{ employee.courses_count }}</td>
<td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_at or "" }}</td>
<td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_at or "" }}</td>
<td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_at or "" }}</td>
<td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Open</a></td>
<td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_display }}</td>
<td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_display }}</td>
<td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_display }}</td>
<td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Открыть</a></td>
</tr>
{% else %}
<tr><td class="directory-table__empty" colspan="13">No employees match these filters.</td></tr>
<tr><td class="directory-table__empty" colspan="13">По этим фильтрам сотрудники не найдены.</td></tr>
{% endfor %}
</tbody>
</table>
@@ -83,24 +89,24 @@
{% set prev_offset = filters.offset - filters.limit %}
{% set next_offset = filters.offset + filters.limit %}
{% if filters.offset > 0 %}
<a class="admin__link" href="{{ request.url.include_query_params(offset=prev_offset) }}">Previous</a>
<a class="admin__link" href="{{ request.url.include_query_params(offset=prev_offset) }}">Назад</a>
{% endif %}
<span class="directory__page">Page {{ page.page }}{% if page.pages %} of {{ page.pages }}{% endif %}</span>
<span class="directory__page">Страница {{ page.page }}{% if page.pages %} из {{ page.pages }}{% endif %}</span>
{% if next_offset < page.total %}
<a class="admin__link" href="{{ request.url.include_query_params(offset=next_offset) }}">Next</a>
<a class="admin__link" href="{{ request.url.include_query_params(offset=next_offset) }}">Вперед</a>
{% endif %}
</div>
</section>
<div class="columns-modal" data-columns-modal hidden>
<div class="columns-modal__backdrop" data-columns-close></div>
<section class="columns-modal__panel" aria-label="Column settings">
<section class="columns-modal__panel" aria-label="Настройка колонок">
<div class="columns-modal__header">
<h3 class="columns-modal__title">Visible columns</h3>
<button class="button button--ghost" type="button" data-columns-close>Close</button>
<h3 class="columns-modal__title">Отображаемые колонки</h3>
<button class="button button--ghost" type="button" data-columns-close>Закрыть</button>
</div>
<div class="columns-modal__grid">
{% for key, label in [("full_name", "Name"), ("status", "Status"), ("positions", "Positions"), ("hse_start_year", "HSE start"), ("email", "Email"), ("phone", "Phone"), ("address", "Address"), ("publications_count", "Publications"), ("courses_count", "Courses"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed"), ("profile", "Profile")] %}
{% for key, label in [("full_name", "ФИО"), ("status", "Статус"), ("positions", "Должности"), ("hse_start_year", "Год начала"), ("email", "Email"), ("phone", "Телефон"), ("address", "Адрес"), ("publications_count", "Публикации"), ("courses_count", "Курсы"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения"), ("profile", "Профиль")] %}
<label class="columns-modal__option"><input class="columns-modal__checkbox" type="checkbox" value="{{ key }}" data-column-toggle> {{ label }}</label>
{% endfor %}
</div>

View File

@@ -1,26 +1,226 @@
{% extends "base.html" %}
{% block title %}{{ employee.full_name }} · MIEM Employees{% endblock %}
{# Same fallback as the card heading, so a profile without a parsed name does not produce a "None · …" page title. #}
{% block title %}{{ employee_view.full_name or employee.profile_key }} · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">{{ employee.full_name or employee.profile_key }}</h2>
<p><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></p>
<p><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></p>
<h3>Tabs</h3>
<ul>
{% for tab in employee.tabs %}
<li><a class="admin__link" href="{{ tab.href }}">{{ tab.title }}</a></li>
{% endfor %}
</ul>
<h3>Current data</h3>
<pre class="code">{{ employee.current_data | tojson(indent=2) }}</pre>
<section class="employee-card">
<div class="employee-card__header">
<div class="employee-card__identity">
<h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2>
<span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span>
</div>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a>
</div>
<section class="employee-card__section">
<h3 class="employee-section__title">Основная информация</h3>
<dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Должности</dt>
<dd class="employee-card__meta-value">
{% if employee_view.positions %}
<ul class="employee-card__list">
{% for position in employee_view.positions %}
<li class="employee-card__list-item">{{ position }}</li>
{% endfor %}
</ul>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Год начала работы в ВШЭ</dt><dd class="employee-card__meta-value">{{ employee_view.hse_start_year or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Тип профиля</dt><dd class="employee-card__meta-value">{{ employee_view.profile_type or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">ID профиля</dt><dd class="employee-card__meta-value">{{ employee_view.profile_id or "Не указано" }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Впервые найден</dt><dd class="employee-card__meta-value">{{ employee_view.first_seen_display }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Последний раз найден</dt><dd class="employee-card__meta-value">{{ employee_view.last_seen_display }}</dd></div>
<div class="employee-card__meta-item"><dt class="employee-card__meta-label">Дата увольнения</dt><dd class="employee-card__meta-value">{{ employee_view.dismissed_display }}</dd></div>
</dl>
</section>
<section class="employee-card__section">
<h3 class="employee-section__title">Контакты</h3>
<dl class="employee-card__meta">
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Email</dt>
<dd class="employee-card__meta-value">
{% if employee_view.contacts.emails %}
<ul class="employee-card__list">
{% for email in employee_view.contacts.emails %}
<li class="employee-card__list-item"><a class="admin__link" href="mailto:{{ email }}">{{ email }}</a></li>
{% endfor %}
</ul>
{% else %}
Не указано
{% endif %}
</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Телефоны</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.phones | join(", ") if employee_view.contacts.phones else "Не указано" }}</dd>
</div>
<div class="employee-card__meta-item">
<dt class="employee-card__meta-label">Адрес</dt>
<dd class="employee-card__meta-value">{{ employee_view.contacts.address or "Не указано" }}</dd>
</div>
{% if employee_view.contacts.contact_items %}
<div class="employee-card__meta-item employee-card__meta-item--wide">
<dt class="employee-card__meta-label">Прочее</dt>
<dd class="employee-card__meta-value">
<ul class="employee-card__list">
{% for item in employee_view.contacts.contact_items %}
<li class="employee-card__list-item">{{ item }}</li>
{% endfor %}
</ul>
</dd>
</div>
{% endif %}
</dl>
</section>
{% if employee_view.external_ids %}
<section class="employee-card__section">
<h3 class="employee-section__title">Внешние идентификаторы</h3>
<ul class="employee-card__list">
{% for external_id in employee_view.external_ids %}
<li class="employee-card__list-item">
<strong>{{ external_id.system }}:</strong>
{% if external_id.url %}
<a class="admin__link" href="{{ external_id.url }}">{{ external_id.value }}</a>
{% else %}
{{ external_id.value }}
{% endif %}
</li>
{% endfor %}
</ul>
</section>
{% endif %}
<section class="employee-card__section">
<h3 class="employee-section__title">Разделы профиля</h3>
{% if employee_view.sections %}
<div class="employee-card__sections">
{% for section in employee_view.sections %}
<article class="employee-section">
<div class="employee-section__header">
<h4 class="employee-section__title">{{ section.title }}</h4>
<span class="employee-section__type">{{ section.type }}</span>
</div>
{% if section.type == "year_blocks" and section.year_entries %}
<ul class="employee-card__list">
{% for entry in section.year_entries %}
<li class="employee-card__list-item">{% if entry.year %}<strong>{{ entry.year }}:</strong> {% endif %}{{ entry.text }}</li>
{% endfor %}
</ul>
{% elif section.type == "publications" and section.publications %}
{% if section.publications_count %}<p class="employee-section__note">Всего: {{ section.publications_count }}</p>{% endif %}
<ul class="employee-card__list">
{% for publication in section.publications %}
<li class="employee-card__list-item">
{% if publication.url %}
<a class="admin__link" href="{{ publication.url }}">{{ publication.title }}</a>
{% else %}
{{ publication.title }}
{% endif %}
{% if publication.text and publication.text != publication.title %}<div class="employee-section__text">{{ publication.text }}</div>{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "courses_by_year" and section.courses %}
{% if section.academic_year %}<p class="employee-section__note">Учебный год: {{ section.academic_year }}</p>{% endif %}
<ul class="employee-card__list">
{% for course in section.courses %}
<li class="employee-card__list-item">
{% if course.url %}
<a class="admin__link" href="{{ course.url }}">{{ course.title }}</a>
{% else %}
{{ course.title }}
{% endif %}
</li>
{% endfor %}
</ul>
{% elif section.type == "graduation_theses" and section.theses %}
{% if section.theses_count %}<p class="employee-section__note">Всего: {{ section.theses_count }}</p>{% endif %}
<ul class="employee-card__list">
{% for thesis in section.theses %}
<li class="employee-card__list-item">
{% if thesis.student %}<strong>{{ thesis.student }}</strong>{% endif %}
{% if thesis.title %}
<div class="employee-section__text">
{% if thesis.project_url %}
<a class="admin__link" href="{{ thesis.project_url }}">{{ thesis.title }}</a>
{% else %}
{{ thesis.title }}
{% endif %}
</div>
{% endif %}
<div class="employee-section__meta">
{% if thesis.defense_year %}<span class="employee-section__meta-item">Год защиты: {{ thesis.defense_year }}</span>{% endif %}
{% if thesis.level %}<span class="employee-section__meta-item">{{ thesis.level }}</span>{% endif %}
{% if thesis.rating is not none %}<span class="employee-section__meta-item">Оценка: {{ thesis.rating }}</span>{% endif %}
{% if thesis.program %}
<span class="employee-section__meta-item">
{% if thesis.program_url %}<a class="admin__link" href="{{ thesis.program_url }}">{{ thesis.program }}</a>{% else %}{{ thesis.program }}{% endif %}
</span>
{% endif %}
</div>
</li>
{% endfor %}
</ul>
{% elif section.type == "table" and section.table %}
<div class="employee-section__table-wrap">
<table class="employee-section__table">
{% if section.table.headers %}
<thead><tr>{% for header in section.table.headers %}<th class="employee-section__head">{{ header }}</th>{% endfor %}</tr></thead>
{% endif %}
<tbody>
{% for row in section.table.rows %}
<tr>
{% for cell in row.cells %}
<td class="employee-section__cell">{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
{% if section.paragraphs %}
{% for paragraph in section.paragraphs %}
<p class="employee-section__text">{{ paragraph }}</p>
{% endfor %}
{% endif %}
{% if section.list_items %}
<ul class="employee-card__list">
{% for item in section.list_items %}
<li class="employee-card__list-item">{{ item }}</li>
{% endfor %}
</ul>
{% endif %}
{% endif %}
{% if section.links and section.type not in ["courses_by_year", "graduation_theses"] %}
<div class="employee-section__links">
{% for link in section.links %}
<a class="employee-section__link" href="{{ link.url }}">{{ link.text }}</a>
{% endfor %}
</div>
{% endif %}
</article>
{% endfor %}
</div>
{% else %}
<p class="employee-section__text">Разделы профиля не найдены.</p>
{% endif %}
</section>
</section>
<section class="panel">
<h2 class="panel__title">Snapshots</h2>
<h2 class="panel__title">Снапшоты</h2>
<table class="table">
<thead><tr><th class="table__head">Captured</th><th class="table__head">Checksum</th><th class="table__head">Parser</th></tr></thead>
<thead><tr><th class="table__head">Дата</th><th class="table__head">Checksum</th><th class="table__head">Парсер</th></tr></thead>
<tbody>
{% for snapshot in snapshots %}
<tr><td class="table__cell">{{ snapshot.captured_at }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
<tr><td class="table__cell">{{ snapshot.captured_display }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -1,29 +0,0 @@
{% extends "base.html" %}
{% block title %}Employees · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">Employees</h2>
<form class="form" method="get" action="/admin/employees">
<input class="form__input" name="q" value="{{ q }}" placeholder="Name or URL">
<select class="form__select" name="status">
<option value="" {% if not status %}selected{% endif %}>All</option>
<option value="active" {% if status == "active" %}selected{% endif %}>Active</option>
<option value="dismissed" {% if status == "dismissed" %}selected{% endif %}>Dismissed</option>
</select>
<button class="button" type="submit">Search</button>
</form>
<table class="table">
<thead><tr><th class="table__head">Name</th><th class="table__head">Status</th><th class="table__head">Last seen</th><th class="table__head">Profile</th></tr></thead>
<tbody>
{% for employee in employees %}
<tr>
<td class="table__cell"><a class="admin__link" href="/admin/employees/{{ employee.id }}">{{ employee.full_name or employee.profile_key }}</a></td>
<td class="table__cell"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
<td class="table__cell">{{ employee.last_seen_at }}</td>
<td class="table__cell"><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}

View File

@@ -3,18 +3,18 @@
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Login · MIEM Employees</title>
<title>Вход · MIEM Employees</title>
<link rel="stylesheet" href="/static/admin.css">
</head>
<body class="admin">
<main class="admin__main">
<section class="panel">
<h1 class="panel__title">Admin login</h1>
<h1 class="panel__title">Вход в админку</h1>
{% if error %}<p>{{ error }}</p>{% endif %}
<form class="form" method="post" action="/admin/login">
<label class="form__label">Login <input class="form__input" name="username" autocomplete="username"></label>
<label class="form__label">Password <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
<button class="button" type="submit">Sign in</button>
<label class="form__label">Логин <input class="form__input" name="username" autocomplete="username"></label>
<label class="form__label">Пароль <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
<button class="button" type="submit">Войти</button>
</form>
</section>
</main>

View File

@@ -1,10 +1,10 @@
{% extends "base.html" %}
{% block title %}Runs · MIEM Employees{% endblock %}
{% block title %}Запуски · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<div class="progress-panel__header">
<h2 class="panel__title">Crawl runs</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl now</button></form>
<h2 class="panel__title">Запуски парсинга</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Запустить парсинг</button></form>
</div>
{% set run = runs[0] if runs else none %}
{% if run %}
@@ -12,9 +12,9 @@
{% set percent = ((processed / run.found_count) * 100) | round(1) if run.found_count else 0 %}
<div class="progress-panel" data-progress-panel>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status }}</span>
<span><span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ run.found_count }}</span> processed</span>
<span><span data-progress-errors>{{ run.error_count }}</span> errors</span>
<span data-progress-status>{{ run.status_display }}</span>
<span>обработано: <span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ run.found_count }}</span></span>
<span>ошибок: <span data-progress-errors>{{ run.error_count }}</span></span>
</div>
{# Accessible name localized to match the rest of the Russian runs page. #}
<div class="progress-bar" aria-label="Прогресс парсинга">
<div class="progress-bar__fill" data-progress-fill style="width: {{ percent }}%"></div>
@@ -24,9 +24,9 @@
{% else %}
<div class="progress-panel" data-progress-panel>
<div class="progress-panel__meta">
<span data-progress-status>idle</span>
<span><span data-progress-processed>0</span> / <span data-progress-found>0</span> processed</span>
<span><span data-progress-errors>0</span> errors</span>
<span data-progress-status>Ожидание</span>
<span>обработано: <span data-progress-processed>0</span> / <span data-progress-found>0</span></span>
<span>ошибок: <span data-progress-errors>0</span></span>
</div>
{# Accessible name localized to match the rest of the Russian runs page (idle state). #}
<div class="progress-bar" aria-label="Прогресс парсинга">
<div class="progress-bar__fill" data-progress-fill style="width: 0%"></div>
@@ -35,18 +35,18 @@
</div>
{% endif %}
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">New</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead>
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Найдено</th><th class="table__head">Обработано</th><th class="table__head">Новые</th><th class="table__head">Ошибки</th><th class="table__head">Уволены</th><th class="table__head">Старт</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr>
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
<section class="panel">
<h2 class="panel__title">Recent errors</h2>
<h2 class="panel__title">Последние ошибки</h2>
<table class="table">
<thead><tr><th class="table__head">Run</th><th class="table__head">Profile</th><th class="table__head">Error</th></tr></thead>
<thead><tr><th class="table__head">Запуск</th><th class="table__head">Профиль</th><th class="table__head">Ошибка</th></tr></thead>
<tbody>
{% for error in errors %}
<tr><td class="table__cell">{{ error.crawl_run_id }}</td><td class="table__cell">{{ error.profile_url }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td></tr>

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.2.1"
FRONTEND_VERSION = "0.2.1"
BACKEND_VERSION = "0.2.1"
APP_VERSION = "0.2.8"
FRONTEND_VERSION = "0.2.8"
BACKEND_VERSION = "0.2.8"

View File

@@ -1,6 +1,6 @@
[project]
name = "miem-workers"
version = "0.1.0"
version = "0.2.8"
description = "MIEM employees parser, admin API, and MCP server"
requires-python = ">=3.11"
dependencies = [

View File

@@ -1,7 +1,22 @@
from datetime import datetime, timezone
from app.models import CrawlRun, Employee
from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
format_admin_datetime,
list_employees_page,
run_payload,
stats_payload,
)
def test_format_admin_datetime_handles_datetime_string_and_none():
    """Check the three input kinds passed to ``format_admin_datetime``.

    An aware UTC ``datetime`` and an ISO-8601 string for the same instant
    must both render as ``"28.04.2026 20:13"`` (three hours ahead of the UTC
    input), and ``None`` must render as the "Не указано" placeholder.
    """
    expected = "28.04.2026 20:13"
    aware_utc = datetime(2026, 4, 28, 17, 13, 34, tzinfo=timezone.utc)
    iso_text = "2026-04-28T17:13:34.448605+00:00"

    assert format_admin_datetime(aware_utc) == expected
    assert format_admin_datetime(iso_text) == expected
    assert format_admin_datetime(None) == "Не указано"
def test_employee_display_payload_extracts_common_fields(db_session):
@@ -26,9 +41,103 @@ def test_employee_display_payload_extracts_common_fields(db_session):
payload = employee_display_payload(employee)
assert payload["positions_text"] == "Professor"
assert payload["status_display"] == "Работает"
assert payload["email_text"] == "person@hse.ru"
assert payload["publications_count"] == 1
assert payload["courses_count"] == 1
assert payload["first_seen_display"] != "Не указано"
def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
    """Check that ``employee_detail_payload`` exposes every section kind.

    The employee carries contacts, one external id and five sections
    (year blocks, publications, courses, graduation theses and a generic
    section with only ``raw_text``). The assertions pin the keys the detail
    template reads for each of them.
    """
    # Raw parser output as stored on the employee; section order matters
    # because the assertions below index sections by position.
    profile_data = {
        "positions": ["Professor"],
        "hse_start_year": 2024,
        "contacts": {
            "emails": ["person@hse.ru"],
            "phones": ["+79990000000"],
            "address": "Moscow",
            "items": [{"raw": "consultation hours"}],
        },
        "external_ids": [{"system": "ORCID", "value": "0000", "url": "https://orcid.org/0000"}],
        "sections": [
            {
                "title": "Education",
                "type": "year_blocks",
                "year_entries": [{"year": 2020, "text": "Master degree"}],
            },
            {
                "title": "Publications",
                "type": "publications",
                "publications": [{"title": "Paper", "text": "Paper details", "url": "https://example.test/paper"}],
            },
            {
                "title": "Courses",
                "type": "courses_by_year",
                "academic_year": "2025/2026",
                "courses": [{"title": "Course", "url": "https://example.test/course"}],
            },
            {
                "title": "ВКР",
                "type": "graduation_theses",
                "theses_count": 1,
                "theses": [
                    {
                        "student": "Student Name",
                        "title": "Thesis title",
                        "defense_year": 2025,
                        "project_url": "https://www.hse.ru/edu/vkr/1",
                    }
                ],
            },
            {
                "title": "Fallback",
                "type": "generic",
                "raw_text": "Fallback text",
            },
        ],
    }
    employee = Employee(
        profile_key="staff:person",
        profile_type="staff",
        profile_id="person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data=profile_data,
    )

    payload = employee_detail_payload(employee)

    contacts = payload["contacts"]
    assert contacts["emails"] == ["person@hse.ru"]
    # The raw "items" entries are exposed under "contact_items" as plain strings.
    assert contacts["contact_items"] == ["consultation hours"]
    assert payload["external_ids"][0]["system"] == "ORCID"

    sections = payload["sections"]
    assert sections[0]["year_entries"][0]["text"] == "Master degree"
    assert sections[1]["publications"][0]["title"] == "Paper"
    assert sections[2]["courses"][0]["title"] == "Course"
    assert sections[3]["theses"][0]["student"] == "Student Name"
    # A section that only has raw_text is surfaced as a single paragraph.
    assert sections[4]["paragraphs"] == ["Fallback text"]
def test_employee_payloads_tolerate_malformed_current_data(db_session):
    """Both payload builders must return empty defaults when ``current_data`` is a string."""
    employee_fields = {
        "profile_key": "staff:broken",
        "canonical_url": "https://www.hse.ru/staff/broken",
        "full_name": "Broken Data",
        "status": "active",
        "first_seen_at": datetime.now(timezone.utc),
        "last_seen_at": datetime.now(timezone.utc),
        # Deliberately not a mapping: this is the malformed input under test.
        "current_data": "not-a-dict",
    }
    employee = Employee(**employee_fields)

    # Build both payloads before asserting, mirroring the call order of the check.
    display = employee_display_payload(employee)
    detail = employee_detail_payload(employee)

    assert display["positions"] == []
    assert display["email_text"] == ""

    detail_contacts = detail["contacts"]
    assert detail_contacts["emails"] == []
    assert detail_contacts["contact_items"] == []
    assert detail["sections"] == []
def test_list_employees_page_filters_sorts_and_paginates(db_session):
@@ -59,7 +168,8 @@ def test_list_employees_page_filters_sorts_and_paginates(db_session):
page = list_employees_page(db_session, status="active", sort="full_name", direction="asc", limit=10)
assert page["total"] == 1
assert page["items"][0]["full_name"] == "Alpha"
assert page["employees"][0]["full_name"] == "Alpha"
assert page["limit"] == 50
def test_stats_payload_uses_latest_run_new_count(db_session):
@@ -96,3 +206,4 @@ def test_run_payload_calculates_progress():
assert payload["processed_count"] == 5
assert payload["progress_percent"] == 50.0
assert payload["status_display"] == "Выполняется"

View File

@@ -0,0 +1,34 @@
from pathlib import Path
def test_base_navigation_is_russian_and_has_no_legacy_employees_link():
    """The base template must contain the Russian nav labels and no legacy employees link."""
    markup = Path("app/templates/base.html").read_text(encoding="utf-8")

    expected_labels = ("Обзор", "Сотрудники", "Запуски", "Выйти")
    removed_fragments = (">Employees<", "/admin/employees")

    for label in expected_labels:
        assert label in markup
    for fragment in removed_fragments:
        assert fragment not in markup
def test_directory_template_is_russian_and_uses_display_dates():
    """The directory template must use Russian labels and the ``*_display`` date fields."""
    markup = Path("app/templates/directory.html").read_text(encoding="utf-8")

    expected_fragments = (
        "Сотрудники",
        "Колонки",
        "Применить",
        "На странице: {{ value }}",
        "{% for value in [25, 50, 100] %}",
        "Найдено:",
        "employee.first_seen_display",
        "employee.last_seen_display",
        "employee.dismissed_display",
    )
    removed_fragments = ("Directory", "employees found")

    for fragment in expected_fragments:
        assert fragment in markup
    for fragment in removed_fragments:
        assert fragment not in markup
def test_admin_employees_route_redirects_to_directory():
    """The admin module source must contain a 303 redirect to ``/admin/directory``."""
    admin_module = Path("app/admin.py")
    expected_call = 'RedirectResponse("/admin/directory", status_code=303)'
    assert expected_call in admin_module.read_text(encoding="utf-8")

View File

@@ -18,7 +18,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.2.1"
assert response.json()["backend_version"] == "0.2.8"
def test_mcp_requires_token_and_lists_tools():

13
tests/test_config.py Normal file
View File

@@ -0,0 +1,13 @@
from app.config import Settings
def test_empty_crawl_limit_is_treated_as_none():
    """An empty-string ``crawl_limit`` must be exposed as ``None``."""
    assert Settings(crawl_limit="").crawl_limit is None
def test_numeric_crawl_limit_is_parsed():
    """A numeric string ``crawl_limit`` must be exposed as the matching integer."""
    assert Settings(crawl_limit="25").crawl_limit == 25

View File

@@ -0,0 +1,30 @@
from pathlib import Path
def test_employee_detail_template_is_human_readable():
    """The employee detail template must use Russian labels and the renamed list keys.

    Each entry pairs a snippet with whether it must be present in the
    template; entries are checked in the order listed.
    """
    markup = Path("app/templates/employee_detail.html").read_text(encoding="utf-8")

    expectations = (
        ("Current data", False),
        ('<pre class="code"', False),
        (">Tabs<", False),
        ("contacts.items", False),
        ("contacts.contact_items", True),
        ("section.items", False),
        ("section.list_items", True),
        ("Основная информация", True),
        ("Контакты", True),
        ("Разделы профиля", True),
        ("graduation_theses", True),
        ("Год защиты", True),
        ("Parser version", False),
        ("First seen", False),
        ("Last seen", False),
        ("Dismissed at", False),
        ("Profile type", False),
        ("Profile ID", False),
        ("Впервые найден", True),
        ("Последний раз найден", True),
        ("Дата увольнения", True),
        ("Тип профиля", True),
        ("ID профиля", True),
        ("Снапшоты", True),
    )

    for snippet, must_be_present in expectations:
        if must_be_present:
            assert snippet in markup
        else:
            assert snippet not in markup

View File

@@ -1,9 +1,69 @@
from bs4 import BeautifulSoup
from app.parser.profile import extract_person_tabs
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
class FakeResponse:
    """Response stub that hands back a fixed payload from ``json()``."""

    def __init__(self, payload):
        # Stored as-is; ``json()`` returns this same object.
        self.payload = payload

    def raise_for_status(self):
        """Do nothing and return ``None``; the stub never reports an error."""
        return None

    def json(self):
        """Return the payload given to the constructor."""
        return self.payload
class FakeSession:
    """Session stub that records ``post``/``get`` calls and answers with canned payloads."""

    def __init__(self):
        # Each entry is a ``(url, kwargs)`` tuple, appended in call order.
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Record the call and return a response holding one ``ARTICLE`` item."""
        self.posts.append((url, kwargs))
        article = {
            "id": "888959076",
            "type": "ARTICLE",
            "title": "Дублирование пакетов",
            "year": 2023,
            "description": {"short": {"ru": "Информационные процессы. 2023."}},
        }
        body = {
            "status": "ok",
            "result": {
                "more": False,
                "total": 1,
                "items": [article],
            },
        }
        return FakeResponse(body)

    def get(self, url, **kwargs):
        """Record the call and return a response holding one thesis record."""
        self.gets.append((url, kwargs))
        thesis = {
            "id": 1045750164,
            "year": 2025,
            "level": "Бакалавриат",
            "title": "Аппаратно-программный комплекс защиты сети",
            "rating": 8,
            "student": "Лесняк Владислав Евгеньевич",
            "learnProgram": {"title": "Информатика и вычислительная техника", "url": "https://hse.ru/ba/isct/"},
            "orgUnit": {"title": "МИЭМ", "url": "https://www.hse.ru/org/url/59315150"},
            "supervisors": [{"url": "https://www.hse.ru/org/persons/803294906", "name": "Борисов Сергей Петрович"}],
        }
        body = {
            "lang": "ru",
            "success": True,
            "data": [thesis],
        }
        return FakeResponse(body)
def test_normalize_profile_url_supports_staff_and_org_persons():
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
@@ -26,3 +86,34 @@ def test_extract_person_tabs_prefers_person_menu_addition():
assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Widget script tags must produce publications and graduation-theses sections.

    The page contains one publications widget and one VKR widget; the fake
    session supplies the responses, and the recorded calls are checked at the
    end.
    """
    widget_markup = (
        "\n"
        '<script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>\n'
        '<script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>\n'
    )
    page = BeautifulSoup(widget_markup, "html.parser")
    fake_session = FakeSession()

    sections = enrich_sections_from_hse_widgets(
        fake_session,
        page,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    def first_of_type(section_type):
        # First section carrying the requested type, in returned order.
        return next(section for section in sections if section["type"] == section_type)

    publications = first_of_type("publications")
    theses = first_of_type("graduation_theses")

    assert publications["publications_count"] == 1
    first_publication = publications["publications"][0]
    assert first_publication["url"] == "https://publications.hse.ru/view/888959076"

    assert theses["theses_count"] == 1
    first_thesis = theses["theses"][0]
    assert first_thesis["student"] == "Лесняк Владислав Евгеньевич"
    assert first_thesis["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"

    # Recorded calls are ``(url, kwargs)`` tuples.
    posted_url, _ = fake_session.posts[0]
    assert posted_url == "https://publications.hse.ru/api/searchPubs"
    _, get_kwargs = fake_session.gets[0]
    assert get_kwargs["params"] == {"supervisorId": "803294906"}