Compare commits

...

3 Commits

36 changed files with 2909 additions and 2 deletions

View File

@@ -41,6 +41,13 @@ uvicorn app.main:app --reload
Админка: `http://localhost:8000/admin`. Админка: `http://localhost:8000/admin`.
В админке доступны:
- `Dashboard`: общая статистика, последний добавленный сотрудник, прогресс текущего/последнего парсинга и ручной запуск.
- `Directory`: настраиваемая таблица сотрудников с фильтрами, сортировкой, пагинацией и выбором колонок.
- `Employees`: простая legacy-таблица сотрудников.
- `Runs`: история запусков, ошибки и progress bar.
## Docker Compose ## Docker Compose
```bash ```bash
@@ -57,7 +64,7 @@ docker compose up --build
## Парсинг ## Парсинг
Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на странице `Runs` или через REST: Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на `Dashboard` и странице `Runs` или через REST:
```bash ```bash
curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..." curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=..."
@@ -67,9 +74,12 @@ curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=.
- найденные сотрудники получают статус `active` и обновленный `last_seen_at`; - найденные сотрудники получают статус `active` и обновленный `last_seen_at`;
- новые сотрудники добавляются в `employees`; - новые сотрудники добавляются в `employees`;
- количество новых сотрудников за запуск сохраняется в `crawl_runs.new_count`;
- активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`; - активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`;
- каждый успешный разбор сохраняет запись в `employee_snapshots`. - каждый успешный разбор сохраняет запись в `employee_snapshots`.
Во время выполнения парсинга `found_count`, `parsed_count` и `error_count` обновляются в базе. Админка опрашивает `/api/crawl-runs/latest` и показывает прогресс как `(parsed_count + error_count) / found_count`.
## MCP ## MCP
Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`. Endpoint: `POST /mcp`, авторизация `Authorization: Bearer <MCP_TOKEN>`.
@@ -100,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql
docker compose down docker compose down
``` ```
Версия сервиса: `0.1.0`. Админка всегда показывает версии backend и frontend в footer. Версия сервиса: `0.2.0`. Админка всегда показывает версии backend и frontend в footer.

1
app/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""MIEM employees service."""

204
app/admin.py Normal file
View File

@@ -0,0 +1,204 @@
from fastapi import APIRouter, BackgroundTasks, Depends, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from sqlalchemy import desc, func, or_, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlError, CrawlRun, Employee
from app.security import SESSION_COOKIE, require_admin, sign_session, verify_admin
from app.services.admin_data import list_employees_page, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
# Router for the HTML admin UI; every route declared on it is served under /admin.
router = APIRouter(prefix="/admin")
# Jinja2 template loader pointing at the relative path app/templates.
templates = Jinja2Templates(directory="app/templates")
@router.get("", response_class=HTMLResponse)
def dashboard(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
    """Render the admin landing page.

    The template receives the counters from ``stats_payload`` extended with
    the total number of crawl runs and crawl errors, the ten most recently
    started runs, and the serialized newest run (``None`` when there are none).
    """
    require_admin(request, settings)

    def _count_rows(model) -> int:
        # COUNT(*) over the whole table; a missing scalar is reported as 0.
        return db.scalar(select(func.count()).select_from(model)) or 0

    counts = stats_payload(db)
    counts["runs"] = _count_rows(CrawlRun)
    counts["errors"] = _count_rows(CrawlError)

    recent_runs = db.scalars(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(10)).all()
    latest_run = run_payload(recent_runs[0]) if recent_runs else None
    context = {"counts": counts, "runs": recent_runs, "latest_run": latest_run}
    return _render(request, "dashboard.html", context)
@router.get("/login", response_class=HTMLResponse)
def login_form(request: Request):
    """Serve the login form with no error message."""
    context = {"error": None}
    return _render(request, "login.html", context)
@router.post("/login")
def login(
    request: Request,
    username: str = Form(...),
    password: str = Form(...),
    settings: Settings = Depends(get_settings),
):
    """Check the submitted credentials and start an admin session.

    On success the client is redirected (303) to ``/admin`` with a signed,
    HTTP-only, SameSite=Lax session cookie. On failure the login form is
    re-rendered with an error message and status 401.
    """
    if verify_admin(username, password, settings):
        response = RedirectResponse("/admin", status_code=303)
        token = sign_session(username, settings)
        response.set_cookie(SESSION_COOKIE, token, httponly=True, samesite="lax")
        return response
    return _render(request, "login.html", {"error": "Неверный логин или пароль"}, status_code=401)
@router.post("/logout")
def logout():
    """Drop the session cookie and redirect (303) to the login page."""
    response = RedirectResponse("/admin/login", status_code=303)
    response.delete_cookie(SESSION_COOKIE)
    return response
@router.get("/employees", response_class=HTMLResponse)
def employees(
    request: Request,
    status: str | None = None,
    q: str | None = None,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Render the simple employee table.

    ``status`` filters by exact status; ``q`` is a case-insensitive substring
    matched against the full name or the canonical URL. At most 200 rows,
    ordered by full name, are shown.
    """
    require_admin(request, settings)

    conditions = []
    if status:
        conditions.append(Employee.status == status)
    if q:
        like = f"%{q}%"
        conditions.append(or_(Employee.full_name.ilike(like), Employee.canonical_url.ilike(like)))

    query = select(Employee)
    for condition in conditions:
        query = query.where(condition)
    query = query.order_by(Employee.full_name).limit(200)

    rows = db.scalars(query).all()
    return _render(request, "employees.html", {"employees": rows, "status": status or "", "q": q or ""})
@router.get("/directory", response_class=HTMLResponse)
def directory(
    request: Request,
    status: str | None = None,
    q: str | None = None,
    started_from: str | None = None,
    started_to: str | None = None,
    has_email: str | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Render the configurable employee directory.

    Query-string values arrive as raw strings: the two dates are parsed with
    ``_parse_date`` (unparseable input becomes ``None``) and ``has_email`` is
    a tri-state where a missing or empty value means "no filter" and only the
    literal ``"true"`` means ``True``. The raw values are echoed back to the
    template under ``filters``.
    """
    require_admin(request, settings)

    if has_email in (None, ""):
        email_filter = None
    else:
        email_filter = has_email == "true"

    page = list_employees_page(
        db,
        status=status,
        q=q,
        started_from=_parse_date(started_from),
        started_to=_parse_date(started_to),
        has_email=email_filter,
        sort=sort,
        direction=direction,
        limit=limit,
        offset=offset,
    )

    filters = {
        "status": status or "",
        "q": q or "",
        "started_from": started_from or "",
        "started_to": started_to or "",
        "has_email": has_email or "",
        "sort": sort,
        "direction": direction,
        "limit": limit,
        "offset": offset,
    }
    return _render(request, "directory.html", {"page": page, "filters": filters})
@router.get("/employees/{employee_id}", response_class=HTMLResponse)
def employee_detail(
    employee_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Render one employee with up to 20 of their newest snapshots.

    An unknown ``employee_id`` redirects (303) back to the employee list.
    """
    require_admin(request, settings)
    employee = db.get(Employee, employee_id)
    if employee:
        history = list(employee.snapshots)
        # Newest capture first; only the first 20 are passed to the template.
        history.sort(key=lambda snapshot: snapshot.captured_at, reverse=True)
        return _render(request, "employee_detail.html", {"employee": employee, "snapshots": history[:20]})
    return RedirectResponse("/admin/employees", status_code=303)
@router.get("/runs", response_class=HTMLResponse)
def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
    """Render the crawl history: the 50 newest runs and the 50 newest errors."""
    require_admin(request, settings)
    newest_runs = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(50)
    newest_errors = select(CrawlError).order_by(desc(CrawlError.created_at)).limit(50)
    context = {
        "runs": db.scalars(newest_runs).all(),
        "errors": db.scalars(newest_errors).all(),
    }
    return _render(request, "runs.html", context)
@router.post("/runs")
def trigger_run(
    request: Request,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Schedule a crawl from the Runs page unless one is already running.

    Always answers with a 303 redirect back to ``/admin/runs``.
    """
    require_admin(request, settings)

    if not get_running_run(db):

        def _crawl() -> None:
            # The background task opens its own session instead of reusing
            # the request-scoped ``db``.
            with SessionLocal() as session:
                run_crawl_if_idle(session, settings)

        background_tasks.add_task(_crawl)

    return RedirectResponse("/admin/runs", status_code=303)
@router.post("/crawl-now")
def crawl_now(
    request: Request,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
):
    """Schedule a crawl from the dashboard unless one is already running.

    Always answers with a 303 redirect back to ``/admin``.
    """
    require_admin(request, settings)

    if not get_running_run(db):

        def _crawl() -> None:
            # The background task opens its own session instead of reusing
            # the request-scoped ``db``.
            with SessionLocal() as session:
                run_crawl_if_idle(session, settings)

        background_tasks.add_task(_crawl)

    return RedirectResponse("/admin", status_code=303)
def _render(request: Request, template: str, context: dict, status_code: int = 200) -> HTMLResponse:
    """Render ``template`` with the shared base context plus ``context``.

    The base context carries the request and the backend/frontend version
    strings; keys in ``context`` override base keys of the same name.
    """
    payload = {
        "request": request,
        "backend_version": BACKEND_VERSION,
        "frontend_version": FRONTEND_VERSION,
    }
    payload.update(context)
    return templates.TemplateResponse(template, payload, status_code=status_code)
def _parse_date(value: str | None):
if not value:
return None
try:
from datetime import date
return date.fromisoformat(value)
except ValueError:
return None

133
app/api.py Normal file
View File

@@ -0,0 +1,133 @@
from datetime import date
from fastapi import APIRouter, BackgroundTasks, Depends, Request
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import SessionLocal, get_db
from app.models import CrawlRun, Employee
from app.security import require_admin
from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.version import BACKEND_VERSION, FRONTEND_VERSION
# Router for the JSON REST API; every route declared on it is served under /api.
router = APIRouter(prefix="/api")
@router.get("/health")
def health() -> dict:
    """Report liveness together with the backend and frontend version strings."""
    versions = {"backend_version": BACKEND_VERSION, "frontend_version": FRONTEND_VERSION}
    return {"status": "ok", **versions}
@router.get("/employees")
def list_employees(
    request: Request,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return one page of employees (admin session required).

    All filter, sort and pagination parameters are forwarded unchanged to
    ``list_employees_page``, whose result is returned as-is.
    """
    require_admin(request, settings)
    page_query = dict(
        status=status,
        q=q,
        started_from=started_from,
        started_to=started_to,
        has_email=has_email,
        sort=sort,
        direction=direction,
        limit=limit,
        offset=offset,
    )
    return list_employees_page(db, **page_query)
@router.get("/employees/{employee_id}")
def get_employee(
    employee_id: int,
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the detailed payload for one employee (admin session required).

    An unknown id yields ``{"error": "not_found"}`` in the response body.
    """
    require_admin(request, settings)
    record = db.get(Employee, employee_id)
    return _employee_detail(record) if record else {"error": "not_found"}
@router.get("/crawl-runs")
def list_crawl_runs(
    request: Request,
    limit: int = 20,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return up to ``limit`` crawl runs, most recently started first."""
    require_admin(request, settings)
    newest_first = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(limit)
    items = []
    for run in db.scalars(newest_first).all():
        items.append(run_payload(run))
    return {"items": items}
@router.get("/crawl-runs/latest")
def latest_crawl_run(
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the currently running crawl and the most recently started crawl.

    Both values are passed through ``run_payload``; either lookup may find
    nothing, in which case ``run_payload`` receives ``None``.
    """
    require_admin(request, settings)
    active_run = get_running_run(db)
    newest_query = select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1)
    newest_run = db.scalar(newest_query)
    return {"running": run_payload(active_run), "latest": run_payload(newest_run)}
@router.post("/crawl-runs")
def trigger_crawl(
    request: Request,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Schedule a crawl in the background (admin session required).

    Returns ``{"status": "already_running", "run": ...}`` when a run is in
    progress, otherwise ``{"status": "scheduled"}``.
    """
    require_admin(request, settings)

    active_run = get_running_run(db)
    if active_run:
        return {"status": "already_running", "run": run_payload(active_run)}

    def _crawl() -> None:
        # The background task opens its own session instead of reusing the
        # request-scoped ``db``.
        with SessionLocal() as session:
            run_crawl_if_idle(session, settings)

    background_tasks.add_task(_crawl)
    return {"status": "scheduled"}
@router.get("/stats")
def stats(
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Return the statistics built by ``stats_payload`` (admin session required)."""
    require_admin(request, settings)
    summary = stats_payload(db)
    return summary
def _employee_summary(employee: Employee) -> dict:
    """Return the employee payload produced by ``employee_display_payload``."""
    summary = employee_display_payload(employee)
    return summary
def _employee_detail(employee: Employee) -> dict:
    """Return the summary payload extended with ``current_data`` and the profile tabs."""
    detail = _employee_summary(employee)
    detail["current_data"] = employee.current_data
    tabs = []
    for tab in employee.tabs:
        tabs.append({"title": tab.title, "href": tab.href, "data_index": tab.data_index})
    detail["tabs"] = tabs
    return detail
def _run_summary(run: CrawlRun) -> dict:
    """Return ``run_payload(run)``, or an empty dict when that result is falsy."""
    payload = run_payload(run)
    return payload if payload else {}

25
app/config.py Normal file
View File

@@ -0,0 +1,25 @@
from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings model (pydantic-settings) with the defaults used when nothing overrides them."""

    # A local ".env" file is read as UTF-8; keys without a matching field are ignored.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

    # Database connection URL; the default is a SQLite file in the working directory.
    database_url: str = "sqlite:///./miem_workers.db"
    # Page from which employee profile links are collected.
    source_url: str = "https://miem.hse.ru/persons"
    # Cron expression for the scheduled crawl ("0 3 * * 1" = Mondays at 03:00).
    crawl_cron: str = "0 3 * * 1"
    # Presumably caps the number of profiles processed per run; None = no cap — TODO confirm in the crawler.
    crawl_limit: int | None = None
    # Presumably seconds, passed as the HTTP request timeout — TODO confirm.
    request_timeout: int = 30
    request_delay_seconds: float = 1.0
    parser_use_playwright: bool = False
    # NOTE(review): the credential and secret defaults below look like development
    # values — confirm they are overridden in every deployed environment.
    admin_username: str = "admin"
    admin_password: str = "admin"
    session_secret: str = Field(default="dev-session-secret", min_length=8)
    mcp_token: str = "dev-mcp-token"
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide ``Settings`` instance.

    ``lru_cache`` on a zero-argument function means the object is built on
    the first call and the same instance is returned afterwards.
    """
    loaded = Settings()
    return loaded

35
app/db.py Normal file
View File

@@ -0,0 +1,35 @@
from collections.abc import Generator
from sqlalchemy import create_engine
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
from app.config import get_settings
class Base(DeclarativeBase):
    """Declarative base class; its metadata is what ``init_db`` passes to ``create_all``."""

    pass
def _connect_args(database_url: str) -> dict[str, object]:
    """Return driver ``connect_args`` for the given database URL.

    URLs starting with ``sqlite`` get ``check_same_thread=False``; every other
    URL gets no extra arguments.
    """
    is_sqlite = database_url.startswith("sqlite")
    return {"check_same_thread": False} if is_sqlite else {}
# Engine and session factory are created once, when this module is imported,
# from the cached application settings.
settings = get_settings()
engine = create_engine(settings.database_url, connect_args=_connect_args(settings.database_url))
# Sessions produced by this factory have autoflush and autocommit disabled.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
def init_db() -> None:
    """Create the tables known to ``Base.metadata`` on the configured engine."""
    # Imported only for its side effect: loading the module registers the
    # model classes on ``Base.metadata`` before ``create_all`` runs.
    from app import models as _models  # noqa: F401

    Base.metadata.create_all(bind=engine)
def get_db() -> Generator[Session, None, None]:
    """Yield a database session and close it once the consumer is finished.

    Written as a generator so it can be used as a FastAPI dependency; the
    session is closed even if the consumer raises.
    """
    with SessionLocal() as session:
        yield session

24
app/main.py Normal file
View File

@@ -0,0 +1,24 @@
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from app.admin import router as admin_router
from app.api import router as api_router
from app.db import init_db
from app.mcp import router as mcp_router
from app.version import BACKEND_VERSION
# ASGI application; the advertised version is BACKEND_VERSION.
app = FastAPI(title="MIEM Employees", version=BACKEND_VERSION)
# Files from the relative path app/static are served under /static.
app.mount("/static", StaticFiles(directory="app/static"), name="static")
# Route groups: REST API, HTML admin UI and the MCP endpoint.
app.include_router(api_router)
app.include_router(admin_router)
app.include_router(mcp_router)
@app.on_event("startup")
def startup() -> None:
    """Startup hook: make sure the database tables exist before requests are served."""
    init_db()
    return None
@app.get("/")
def root() -> dict:
    """Return a small service descriptor: name, backend version and the admin path."""
    descriptor = {"service": "miem-employees"}
    descriptor["version"] = BACKEND_VERSION
    descriptor["admin"] = "/admin"
    return descriptor

170
app/mcp.py Normal file
View File

@@ -0,0 +1,170 @@
import json
from fastapi import APIRouter, Depends, Request
from sqlalchemy import desc, or_, select
from sqlalchemy.orm import Session
from app.config import Settings, get_settings
from app.db import get_db
from app.models import CrawlRun, Employee
from app.security import require_mcp_token
# Router for the MCP endpoint; the handler declared on it answers POST /mcp.
router = APIRouter(prefix="/mcp")

# Tool catalogue returned as-is for the "tools/list" method. Each entry carries
# the tool name, a human-readable description and a JSON-Schema "inputSchema"
# describing the accepted arguments. The names here must match the branches
# in _call_tool.
TOOLS = [
    {
        "name": "search_employees",
        "description": "Search MIEM employees by name or profile URL.",
        "inputSchema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "status": {"type": "string", "enum": ["active", "dismissed"]},
                "limit": {"type": "integer", "default": 20},
            },
            "required": ["query"],
        },
    },
    {
        "name": "get_employee",
        "description": "Get one employee by profile id, profile key, or canonical URL.",
        "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
    },
    {
        "name": "list_employee_publications",
        "description": "List publications parsed from an employee profile.",
        "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
    },
    {
        "name": "list_employee_courses",
        "description": "List teaching courses parsed from an employee profile.",
        "inputSchema": {"type": "object", "properties": {"profile_id_or_url": {"type": "string"}}, "required": ["profile_id_or_url"]},
    },
    {
        "name": "get_crawl_status",
        "description": "Return the latest crawl run status.",
        "inputSchema": {"type": "object", "properties": {}},
    },
]
@router.post("")
async def mcp_http(
    request: Request,
    db: Session = Depends(get_db),
    settings: Settings = Depends(get_settings),
) -> dict:
    """Handle one JSON-RPC 2.0 request of the MCP endpoint.

    Supported methods are ``initialize``, ``tools/list`` and ``tools/call``.
    The bearer token is checked first via ``require_mcp_token``.

    Returns a JSON-RPC response object. Errors are reported in-band:
    ``-32700`` for a body that is not valid JSON, ``-32600`` for a payload
    that is not a JSON object, ``-32601`` for an unknown method and
    ``-32000`` (with the exception text) for any failure while handling a
    known method.
    """
    # Function-scope import: this module does not import app.version at the top.
    from app.version import BACKEND_VERSION

    require_mcp_token(request, settings)

    def _error(request_id, code: int, message: str) -> dict:
        return {"jsonrpc": "2.0", "id": request_id, "error": {"code": code, "message": message}}

    try:
        payload = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; without this a malformed body surfaced as an HTTP 500.
        return _error(None, -32700, "Parse error")
    if not isinstance(payload, dict):
        # Batch arrays and scalar payloads are not supported by this endpoint.
        return _error(None, -32600, "Invalid Request")

    method = payload.get("method")
    request_id = payload.get("id")
    params = payload.get("params") or {}
    try:
        if method == "initialize":
            result = {
                "protocolVersion": "2024-11-05",
                # Report the real service version instead of a hard-coded literal
                # that goes stale on every release.
                "serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
                "capabilities": {"tools": {}},
            }
        elif method == "tools/list":
            result = {"tools": TOOLS}
        elif method == "tools/call":
            result = _call_tool(db, params.get("name"), params.get("arguments") or {})
        else:
            return _error(request_id, -32601, "Method not found")
        return {"jsonrpc": "2.0", "id": request_id, "result": result}
    except Exception as exc:
        # Deliberate catch-all boundary: tool failures are returned to the
        # client as a JSON-RPC error rather than an HTTP 500.
        return _error(request_id, -32000, str(exc))
def _call_tool(db: Session, name: str, arguments: dict) -> dict:
    """Run the MCP tool called ``name`` and wrap its result with ``_tool_response``.

    Raises:
        ValueError: if ``name`` is not a known tool.
        KeyError: if a lookup tool is called without ``profile_id_or_url``.
    """
    if name == "search_employees":
        result = _search_employees(db, arguments)
    elif name == "get_employee":
        found = _find_employee(db, arguments["profile_id_or_url"])
        result = _employee_payload(found) if found else {"error": "not_found"}
    elif name == "list_employee_publications":
        found = _find_employee(db, arguments["profile_id_or_url"])
        result = _collect_section_items(found, "publications")
    elif name == "list_employee_courses":
        found = _find_employee(db, arguments["profile_id_or_url"])
        result = _collect_section_items(found, "courses_by_year")
    elif name == "get_crawl_status":
        newest = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
        result = _run_payload(newest) if newest else {"status": "never_run"}
    else:
        raise ValueError(f"Unknown tool: {name}")
    return _tool_response(result)
def _search_employees(db: Session, arguments: dict) -> list[dict]:
    """Search employees for the ``search_employees`` MCP tool.

    Recognised keys in ``arguments``:
        query: case-insensitive substring matched against the full name or
            the canonical URL; empty or missing means no text filter.
        status: optional exact status filter.
        limit: maximum number of rows, default 20, clamped to 1..100.

    Returns compact employee payloads (without ``data``) ordered by full name.
    """
    query = arguments.get("query") or ""
    # Clamp on both sides: a negative value used to be forwarded to SQL LIMIT,
    # which PostgreSQL rejects and SQLite treats as "no limit".
    limit = max(1, min(int(arguments.get("limit") or 20), 100))
    stmt = select(Employee)
    if arguments.get("status"):
        stmt = stmt.where(Employee.status == arguments["status"])
    if query:
        pattern = f"%{query}%"
        stmt = stmt.where(or_(Employee.full_name.ilike(pattern), Employee.canonical_url.ilike(pattern)))
    employees = db.scalars(stmt.order_by(Employee.full_name).limit(limit)).all()
    return [_employee_payload(employee, include_data=False) for employee in employees]
def _find_employee(db: Session, value: str) -> Employee | None:
    """Resolve an employee from a profile key, profile id or (partial) canonical URL.

    Lookup order:
      1. exact match on ``profile_key``, ``profile_id`` or ``canonical_url``;
      2. case-insensitive substring match on ``canonical_url``.

    Returns ``None`` when nothing matches or when ``value`` is blank.
    """
    needle = str(value).strip() if value is not None else ""
    if not needle:
        # A blank needle would turn the substring filter into ILIKE '%%',
        # which matches every row and returned an arbitrary employee.
        return None

    exact_stmt = select(Employee).where(
        or_(
            Employee.profile_key == needle,
            Employee.profile_id == needle,
            Employee.canonical_url == needle,
        )
    )
    exact = db.scalar(exact_stmt.limit(1))
    if exact is not None:
        # Prefer the exact hit; previously a substring match could win because
        # all conditions were OR-ed into one unordered LIMIT 1 query.
        return exact

    # Escape LIKE metacharacters so "_" and "%" in the input match literally.
    escaped = needle.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    fuzzy_stmt = (
        select(Employee)
        .where(Employee.canonical_url.ilike(f"%{escaped}%", escape="\\"))
        .order_by(Employee.id)  # deterministic pick when several URLs contain the needle
    )
    return db.scalar(fuzzy_stmt.limit(1))
def _collect_section_items(employee: Employee | None, section_type: str) -> dict:
    """Gather the entries of every ``current_data`` section of type ``section_type``.

    ``"publications"`` sections contribute their ``publications`` list and
    ``"courses_by_year"`` sections their ``courses`` list; any other section
    type contributes nothing. A missing employee or missing ``current_data``
    yields ``{"items": []}`` without an ``employee`` key.
    """
    if not employee or not employee.current_data:
        return {"items": []}

    if section_type == "publications":
        items_key = "publications"
    elif section_type == "courses_by_year":
        items_key = "courses"
    else:
        items_key = None

    collected = []
    for section in employee.current_data.get("sections") or []:
        if section.get("type") == section_type and items_key is not None:
            collected.extend(section.get(items_key) or [])

    return {"employee": _employee_payload(employee, include_data=False), "items": collected}
def _employee_payload(employee: Employee, include_data: bool = True) -> dict:
    """Serialize an employee for MCP responses.

    Timestamps become ISO-8601 strings (``None`` when unset). The full parsed
    profile is attached under ``data`` only when ``include_data`` is true.
    """

    def _iso(moment):
        return moment.isoformat() if moment else None

    result = {
        "profile_key": employee.profile_key,
        "profile_id": employee.profile_id,
        "full_name": employee.full_name,
        "status": employee.status,
        "canonical_url": employee.canonical_url,
        "last_seen_at": _iso(employee.last_seen_at),
        "dismissed_at": _iso(employee.dismissed_at),
    }
    if not include_data:
        return result
    result["data"] = employee.current_data
    return result
def _run_payload(run: CrawlRun) -> dict:
    """Serialize a crawl run for the ``get_crawl_status`` MCP tool.

    Timestamps become ISO-8601 strings (``None`` when unset). ``new_count`` is
    included so MCP clients see the same per-run counters that the
    ``CrawlRun`` model stores.
    """
    return {
        "id": run.id,
        "status": run.status,
        "source_url": run.source_url,
        "started_at": run.started_at.isoformat() if run.started_at else None,
        "finished_at": run.finished_at.isoformat() if run.finished_at else None,
        "found_count": run.found_count,
        "parsed_count": run.parsed_count,
        # Previously missing although CrawlRun defines the column.
        "new_count": run.new_count,
        "error_count": run.error_count,
        "dismissed_count": run.dismissed_count,
    }
def _tool_response(data: object) -> dict:
    """Wrap ``data`` as an MCP tool result holding a single JSON text item.

    Non-ASCII characters are kept as-is and values JSON cannot encode natively
    are converted with ``str``.
    """
    text = json.dumps(data, ensure_ascii=False, default=str)
    item = {"type": "text", "text": text}
    return {"content": [item]}

110
app/models.py Normal file
View File

@@ -0,0 +1,110 @@
from datetime import datetime, timezone
from sqlalchemy import DateTime, ForeignKey, Index, Integer, LargeBinary, String, Text, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy.types import JSON
from app.db import Base
def utcnow() -> datetime:
    """Return the current moment as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
# Column type for parsed-profile documents: generic JSON, swapped for JSONB
# when the dialect is PostgreSQL.
json_type = JSON().with_variant(JSONB, "postgresql")
class Employee(Base):
    """ORM mapping for the ``employees`` table: one row per profile, unique by ``profile_key``."""

    __tablename__ = "employees"
    __table_args__ = (
        UniqueConstraint("profile_key", name="uq_employees_profile_key"),
        Index("ix_employees_full_name", "full_name"),
        Index("ix_employees_status", "status"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Identity fields of the source profile.
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    profile_type: Mapped[str | None] = mapped_column(String(50))
    profile_id: Mapped[str | None] = mapped_column(String(255))
    canonical_url: Mapped[str] = mapped_column(Text, nullable=False)
    full_name: Mapped[str | None] = mapped_column(Text)
    # New rows default to "active".
    status: Mapped[str] = mapped_column(String(32), default="active", nullable=False)
    # Timezone-aware timestamps; the defaults are taken from utcnow() at insert time.
    first_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    last_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    dismissed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    parser_version: Mapped[str | None] = mapped_column(String(32))
    # Latest parsed profile document and its checksum.
    current_data: Mapped[dict | None] = mapped_column(json_type)
    current_checksum: Mapped[str | None] = mapped_column(String(64))
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    # Refreshed with utcnow() on every UPDATE via onupdate.
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False)

    # Snapshots have no cascade configured; tabs are deleted together with the employee.
    snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee")
    tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan")
class EmployeeSnapshot(Base):
    """ORM mapping for the ``employee_snapshots`` table: one captured parse result of an employee."""

    __tablename__ = "employee_snapshots"
    __table_args__ = (Index("ix_employee_snapshots_employee_id", "employee_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
    # Nullable link to the crawl run that produced the snapshot.
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))
    parsed_data: Mapped[dict] = mapped_column(json_type, nullable=False)
    # Optional binary copy of the page HTML.
    html_snapshot: Mapped[bytes | None] = mapped_column(LargeBinary)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    captured_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

    employee: Mapped[Employee] = relationship(back_populates="snapshots")
class CrawlRun(Base):
    """ORM mapping for the ``crawl_runs`` table: one crawl execution and its counters."""

    __tablename__ = "crawl_runs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    # New rows default to "running".
    status: Mapped[str] = mapped_column(String(32), default="running", nullable=False)
    started_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    # Per-run counters, all starting at 0.
    found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    message: Mapped[str | None] = mapped_column(Text)
class CrawlError(Base):
    """ORM mapping for the ``crawl_errors`` table: one error recorded for a crawl run."""

    __tablename__ = "crawl_errors"
    __table_args__ = (Index("ix_crawl_errors_run_id", "crawl_run_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    crawl_run_id: Mapped[int] = mapped_column(ForeignKey("crawl_runs.id"), nullable=False)
    # Nullable: an error row may carry no profile URL.
    profile_url: Mapped[str | None] = mapped_column(Text)
    error_type: Mapped[str] = mapped_column(String(255), nullable=False)
    message: Mapped[str] = mapped_column(Text, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
class ProfileTab(Base):
    """ORM mapping for the ``profile_tabs`` table: one navigation tab belonging to an employee."""

    __tablename__ = "profile_tabs"
    __table_args__ = (Index("ix_profile_tabs_employee_id", "employee_id"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id"), nullable=False)
    title: Mapped[str] = mapped_column(Text, nullable=False)
    href: Mapped[str] = mapped_column(Text, nullable=False)
    # Presumably the anchor's "data-index" attribute captured by the parser — TODO confirm.
    data_index: Mapped[str | None] = mapped_column(String(64))

    employee: Mapped[Employee] = relationship(back_populates="tabs")
class ParserSource(Base):
    """ORM mapping for the ``parser_sources`` table: a source URL, unique, with an enabled flag."""

    __tablename__ = "parser_sources"
    __table_args__ = (UniqueConstraint("source_url", name="uq_parser_sources_source_url"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    source_url: Mapped[str] = mapped_column(Text, nullable=False)
    # New rows default to enabled.
    enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)

1
app/parser/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""HTML parsing helpers for HSE/MIEM employee pages."""

19
app/parser/collector.py Normal file
View File

@@ -0,0 +1,19 @@
from bs4 import BeautifulSoup
from requests import Session
from app.parser.profile_url import normalize_profile_url
def collect_profile_links(session: Session, source_url: str, headers: dict[str, str], timeout: int) -> list[str]:
    """Download ``source_url`` and return the distinct profile URLs linked from it.

    Every ``<a href>`` is passed through ``normalize_profile_url``; links it
    rejects are skipped. The result keeps first-occurrence order.

    Raises whatever ``response.raise_for_status()`` raises for an HTTP error.
    """
    response = session.get(source_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    document = BeautifulSoup(response.text, "html.parser")

    # A dict keeps insertion order and gives O(1) de-duplication.
    unique_links: dict[str, None] = {}
    for anchor in document.find_all("a", href=True):
        profile_url = normalize_profile_url(anchor["href"])
        if profile_url:
            unique_links.setdefault(profile_url, None)
    return list(unique_links)

380
app/parser/profile.py Normal file
View File

@@ -0,0 +1,380 @@
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString, Tag
from requests import Session
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
from app.version import BACKEND_VERSION
# Captures a four-digit year that follows "Начал/Начала/Начали работать ... ВШЭ".
_YEAR_PATTERN = re.compile(r"Начал[аи]?\s+работать.*?ВШЭ.*?(\d{4})", re.IGNORECASE)
# Captures e-mail addresses anywhere in the text.
_EMAIL_PATTERN = re.compile(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})")
# Captures the number after a "Телефон:" / "Phone:" label: at least 8
# characters drawn from digits, "+", "(", ")", "-" and whitespace.
_PHONE_PATTERN = re.compile(r"(?:Телефон|Phone)\s*:\s*([+()\d\-\s]{8,})", re.IGNORECASE)
def normalize_ws(value: str | None) -> str:
return re.sub(r"\s+", " ", value or "").strip()
def extract_person_tabs(soup: BeautifulSoup, source_url: str) -> list[dict[str, str | None]]:
    """Return the navigation tabs of a person page.

    The specific desktop menu selector is tried first, then the generic
    ``.person-menu``; the first menu that yields at least one tab wins. Each
    tab has ``data_index``, ``title`` and an absolute ``href``. Returns ``[]``
    when no menu produces a tab.
    """
    menu_selectors = (
        "div.person-menu.is-desktop.small.person-menu-addition",
        ".person-menu",
    )
    for css in menu_selectors:
        menu = soup.select_one(css)
        anchors = menu.select("a[href]") if menu else []

        found = []
        for anchor in anchors:
            label = normalize_ws(anchor.get_text(" ", strip=True))
            target = anchor.get("href", "").strip()
            if not (label and target):
                continue
            found.append(
                {
                    "data_index": anchor.get("data-index"),
                    "title": label,
                    "href": urljoin(source_url, target),
                }
            )

        if found:
            return _dedupe_tabs(found)
    return []
def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
    """Extract the header facts of a person page.

    Returns a dict with ``source_url``, ``full_name`` (from ``h1.person-caption``
    or the first ``h1``), ``positions``, ``hse_start_year``, ``contacts``
    (phones, e-mails, address, plus an always-empty ``items`` list) and
    ``external_ids`` (links to ORCID, Scopus, Web of Science, Google Scholar
    and eLibrary, de-duplicated).
    """
    heading = soup.select_one("h1.person-caption") or soup.find("h1")
    # Contacts and the start year are searched in the whole page text.
    page_text = normalize_ws(soup.get_text(" ", strip=True))
    start_year = _YEAR_PATTERN.search(page_text)

    emails: list[str] = []
    for candidate in _EMAIL_PATTERN.findall(page_text):
        if candidate not in emails:
            emails.append(candidate)

    phones: list[str] = []
    for raw_phone in _PHONE_PATTERN.findall(page_text):
        cleaned = normalize_ws(raw_phone)
        if cleaned and cleaned not in phones:
            phones.append(cleaned)

    address = None
    # The address runs from "Адрес" up to the next known label (or the end of
    # the text), capped at 220 characters.
    address_hit = re.search(
        r"(Адрес[:\s].{0,220}?)(?:\s+Время|\s+Расписание|\s+SPIN|\s+ORCID|$)",
        page_text,
        flags=re.IGNORECASE,
    )
    if address_hit:
        address = normalize_ws(address_hit.group(1)).rstrip(",")

    contacts = {"phones": phones, "emails": emails, "address": address, "items": []}

    positions = [
        value
        for li in soup.select("ul.g-ul.g-list.small.employment-add li, ul.employment-add li")
        if (value := normalize_ws(li.get_text(" ", strip=True)))
    ]

    # (system name, substring of the URL that identifies it); first match wins.
    id_domains = (
        ("ORCID", "orcid.org"),
        ("Scopus AuthorID", "scopus.com"),
        ("ResearcherID", "webofscience.com"),
        ("Google Scholar", "scholar.google."),
        ("SPIN РИНЦ", "elibrary.ru"),
    )
    external_ids = []
    for anchor in soup.select("a[href]"):
        href = anchor.get("href", "").strip()
        label = normalize_ws(anchor.get_text(" ", strip=True))
        system = next((name for name, marker in id_domains if marker in href), None)
        if system is not None:
            external_ids.append({"system": system, "value": label or system, "url": href})

    full_name = normalize_ws(heading.get_text(" ", strip=True)) if heading else None
    return {
        "source_url": source_url,
        "full_name": full_name,
        "positions": positions,
        "hse_start_year": int(start_year.group(1)) if start_year else None,
        "contacts": contacts,
        "external_ids": _dedupe_dicts(external_ids),
    }
def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
    """Split a person page into sections, one per ``<h2>`` heading.

    Every section carries ``title``, ``slug``, ``type``, ``raw_text``,
    ``paragraphs``, ``items`` and ``links``; type-specific keys are added
    depending on the inferred section type. Headings with an empty title or
    containing "расписание занятий" are skipped.
    """
    sections = []
    for h2 in soup.select("h2"):
        title = normalize_ws(h2.get_text(" ", strip=True))
        if not title or "расписание занятий" in title.lower():
            continue
        # Content of the section = sibling nodes up to the next <h2>.
        nodes = _collect_between_h2(h2)
        raw_text = _nodes_raw_text(nodes)
        paragraphs = _nodes_paragraphs(nodes)
        items = _nodes_list_items(nodes)
        links = []
        for node in nodes:
            if isinstance(node, Tag):
                links.extend(_extract_links(node, source_url))
        section_type = _infer_section_type(title, nodes)
        section = {
            "title": title,
            "slug": _slugify(title),
            "type": section_type,
            "raw_text": raw_text,
            "paragraphs": paragraphs,
            "items": items,
            "links": links,
        }
        if section_type == "publications":
            section["publications_count"], section["publications"] = _parse_publications(title, nodes, source_url)
            # For publications the generic list items are replaced by the parsed publication texts.
            section["items"] = [item["text"] for item in section["publications"] if item.get("text")]
        elif section_type == "courses_by_year":
            section["academic_year"], section["courses"] = _parse_courses(title, nodes, source_url)
            # Course sections expose only the structured "courses" list.
            section.pop("items", None)
            section.pop("links", None)
        elif section_type == "table":
            section["table"] = _parse_table(nodes, source_url)
        elif "выпускные квалификационные работы студентов ниу вшэ" in title.lower():
            section["items"] = _parse_vkr_items(nodes)
        year_entries = _parse_year_entries(nodes, source_url)
        if year_entries:
            section["year_entries"] = year_entries
            # Generic sections that turn out to contain year entries are relabelled.
            if section_type in {"generic", "paragraphs"}:
                section["type"] = "year_blocks"
        sections.append(section)
    return sections
def parse_person_profile(
    session: Session,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
    use_playwright: bool = False,
) -> dict | None:
    """Fetch one profile page and return its parsed representation.

    Returns ``None`` when ``normalize_profile_url`` rejects ``source_url``.
    With ``use_playwright`` the fetched HTML is replaced by the result of
    ``_render_with_playwright``, which receives the fetched HTML as fallback.
    The HTML that was actually parsed is included under ``_html``.

    Raises whatever ``response.raise_for_status()`` raises for an HTTP error.
    """
    profile_url = normalize_profile_url(source_url)
    if not profile_url:
        return None

    response = session.get(profile_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    fetched_html = response.text
    html = _render_with_playwright(profile_url, fetched_html) if use_playwright else fetched_html

    soup = BeautifulSoup(html, "html.parser")
    profile_type, profile_id = parse_profile_identity(profile_url)
    header = extract_person_header(soup, profile_url)
    tabs = extract_person_tabs(soup, profile_url)
    sections = extract_sections(soup, profile_url)

    tab_links = []
    for tab in tabs:
        if tab.get("href"):
            tab_links.append(tab["href"])

    result = {
        "source_url": profile_url,
        "profile_type": profile_type,
        "profile_id": profile_id,
        "full_name": header.get("full_name"),
        "positions": header.get("positions") or [],
        "hse_start_year": header.get("hse_start_year"),
        "contacts": header.get("contacts") or {},
        "external_ids": header.get("external_ids") or [],
        "tabs": tabs,
        "sections": sections,
        "employee_internal_links": tab_links,
        "parser_version": BACKEND_VERSION,
        "_html": html,
    }
    return result
def _render_with_playwright(source_url: str, fallback_html: str) -> str:
try:
from playwright.sync_api import sync_playwright
except Exception:
return fallback_html
try:
with sync_playwright() as playwright:
browser = playwright.chromium.launch(headless=True)
page = browser.new_page()
page.goto(source_url, wait_until="domcontentloaded", timeout=45000)
for index in range(page.locator(".person-menu a").count()):
try:
page.locator(".person-menu a").nth(index).click(timeout=2500, force=True)
page.wait_for_timeout(450)
except Exception:
continue
html = page.content()
browser.close()
return html
except Exception:
return fallback_html
def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]:
    """Gather the siblings that follow ``start_h2`` up to the next ``<h2>``.

    Text nodes that are empty after ``normalize_ws`` are dropped; every other
    sibling is kept in document order.
    """
    collected = []
    for sibling in start_h2.next_siblings:
        if isinstance(sibling, Tag):
            if sibling.name == "h2":
                # The next heading starts a new section.
                break
        elif isinstance(sibling, NavigableString) and not normalize_ws(str(sibling)):
            continue
        collected.append(sibling)
    return collected
def _extract_links(node: Tag, source_url: str) -> list[dict[str, str]]:
    """Return ``{"text", "url"}`` dicts for the anchors found inside ``node``.

    Anchors without visible text or href are ignored, as are timetable links
    (href containing ``timetable`` or text containing ``расписание``). URLs are
    resolved against ``source_url``.
    """
    collected = []
    for anchor in node.select("a[href]"):
        label = normalize_ws(anchor.get_text(" ", strip=True))
        target = anchor.get("href", "").strip()
        if not label or not target:
            continue
        if "timetable" in target.lower() or "расписание" in label.lower():
            continue
        collected.append({"text": label, "url": urljoin(source_url, target)})
    return collected
def _nodes_raw_text(nodes: list) -> str:
chunks = []
for node in nodes:
text = normalize_ws(node.get_text(" ", strip=True) if isinstance(node, Tag) else str(node))
if text:
chunks.append(text)
return "\n".join(chunks)
def _nodes_paragraphs(nodes: list) -> list[str]:
paragraphs = []
for node in nodes:
if isinstance(node, Tag):
paragraphs.extend(normalize_ws(p.get_text(" ", strip=True)) for p in node.select("p"))
return [p for p in paragraphs if p]
def _nodes_list_items(nodes: list) -> list[str]:
items = []
for node in nodes:
if isinstance(node, Tag):
items.extend(normalize_ws(li.get_text(" ", strip=True)) for li in node.select("li"))
return [item for item in items if item and "расписание" not in item.lower()]
def _infer_section_type(title: str, nodes: list) -> str:
    """Classify a section from its title and content.

    Checks run in priority order: a table anywhere in the nodes, a title
    keyword (publications, then courses), list items, paragraphs; anything
    else is ``"generic"``.
    """
    lowered = title.lower()
    if _has_table(nodes):
        return "table"
    title_markers = (
        ("публикац", "publications"),
        ("учебные курсы", "courses_by_year"),
    )
    for marker, section_type in title_markers:
        if marker in lowered:
            return section_type
    if _nodes_list_items(nodes):
        return "list"
    return "paragraphs" if _nodes_paragraphs(nodes) else "generic"
def _has_table(nodes: list) -> bool:
return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes)
def _parse_table(nodes: list, source_url: str) -> dict:
for node in nodes:
if not isinstance(node, Tag):
continue
table = node if node.name == "table" else node.find("table")
if not table:
continue
headers = [normalize_ws(th.get_text(" ", strip=True)) for th in table.select("th")]
rows = []
for tr in table.select("tr"):
cells = [normalize_ws(td.get_text(" ", strip=True)) for td in tr.select("td")]
if cells:
link = tr.find("a", href=True)
rows.append({"cells": cells, "link_url": urljoin(source_url, link["href"]) if link else None})
return {"headers": headers, "rows": rows}
return {"headers": [], "rows": []}
def _parse_publications(title: str, nodes: list, source_url: str) -> tuple[int | None, list[dict]]:
    """Extract the publication count and entries of a publications section.

    The count is the trailing number of ``title`` (``None`` when absent).
    Entries come from the ``<li>`` items of the first tag node that yields any;
    when no node has list items, each non-empty line of the raw section text
    becomes an entry without a URL.
    """
    trailing_number = re.search(r"(\d+)\s*$", title)
    total = int(trailing_number.group(1)) if trailing_number else None

    entries = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            text = normalize_ws(li.get_text(" ", strip=True))
            if not text:
                continue
            anchor = li.find("a", href=True)
            if anchor:
                entry_title = normalize_ws(anchor.get_text(" ", strip=True))
                entry_url = urljoin(source_url, anchor["href"])
            else:
                entry_title, entry_url = text, None
            entries.append({"title": entry_title, "url": entry_url, "text": text})
        if entries:
            # Only the first node that produced items is used.
            break

    if not entries:
        lines = _nodes_raw_text(nodes).split("\n")
        entries = [{"title": line, "url": None, "text": line} for line in lines if line]
    return total, entries
def _parse_courses(title: str, nodes: list, source_url: str) -> tuple[str | None, list[dict]]:
    """Extract the academic year and the de-duplicated course list.

    The year is the first ``YYYY/YYYY`` token in ``title``. Each ``<li>``
    contributes a course titled after its first link (or its own text when it
    has no link), with the link resolved against ``source_url``.
    """
    year_match = re.search(r"(\d{4}/\d{4})", title)
    academic_year = year_match.group(1) if year_match else None

    courses = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            anchor = li.find("a", href=True)
            title_source = anchor if anchor else li
            course_title = normalize_ws(title_source.get_text(" ", strip=True))
            if not course_title:
                continue
            course_url = urljoin(source_url, anchor["href"]) if anchor else None
            courses.append({"title": course_title, "url": course_url})
    return academic_year, _dedupe_dicts(courses)
def _parse_year_entries(nodes: list, source_url: str) -> list[dict]:
entries = []
for node in nodes:
if not isinstance(node, Tag):
continue
for year_node in node.select(".person-list-hangover"):
year_match = re.search(r"(19\d{2}|20\d{2})", year_node.get_text(" ", strip=True))
parent = year_node.parent
if parent:
entries.append(
{
"year": int(year_match.group(1)) if year_match else None,
"text": normalize_ws(parent.get_text(" ", strip=True)),
"links": _extract_links(parent, source_url),
}
)
return entries
def _parse_vkr_items(nodes: list) -> list[str]:
items = []
for node in nodes:
if isinstance(node, Tag):
items.extend(normalize_ws(li.get_text(" ", strip=True)) for li in node.select("li"))
return [item for item in dict.fromkeys(items) if item]
def _slugify(value: str) -> str:
cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"
def _dedupe_tabs(items: list[dict]) -> list[dict]:
seen = set()
unique = []
for item in items:
key = (item.get("title"), item.get("href"))
if key not in seen:
seen.add(key)
unique.append(item)
return unique
def _dedupe_dicts(items: list[dict]) -> list[dict]:
seen = set()
unique = []
for item in items:
key = tuple(sorted(item.items()))
if key not in seen:
seen.add(key)
unique.append(item)
return unique

46
app/parser/profile_url.py Normal file
View File

@@ -0,0 +1,46 @@
import re
from urllib.parse import urljoin, urlsplit, urlunsplit
BASE_URL = "https://www.hse.ru"
_ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$")
_STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$")
def normalize_profile_url(href: str | None) -> str | None:
if not href:
return None
candidate = urljoin(BASE_URL + "/", href.strip())
split = urlsplit(candidate)
path = split.path.rstrip("/")
org_match = _ORG_PATTERN.match(path + "/")
if org_match:
return urlunsplit(("https", "www.hse.ru", f"/org/persons/{org_match.group(1)}", "", ""))
staff_match = _STAFF_PATTERN.match(path + "/")
if staff_match:
return urlunsplit(("https", "www.hse.ru", f"/staff/{staff_match.group(1)}", "", ""))
return None
def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Split a profile URL into ``(profile_type, profile_id)``.

    Returns ``("org_person", <digits>)`` or ``("staff", <slug>)`` for supported
    URLs and ``(None, None)`` for everything else.
    """
    normalized = normalize_profile_url(profile_url)
    if not normalized:
        return None, None
    probe = urlsplit(normalized).path.rstrip("/") + "/"
    for kind, pattern in (("org_person", _ORG_PATTERN), ("staff", _STAFF_PATTERN)):
        match = pattern.match(probe)
        if match:
            return kind, match.group(1)
    return None, None
def profile_key(profile_url: str) -> str | None:
    """Return ``"<profile_type>:<profile_id>"`` for a profile URL, else ``None``."""
    identity = parse_profile_identity(profile_url)
    if not all(identity):
        return None
    return ":".join(identity)

52
app/security.py Normal file
View File

@@ -0,0 +1,52 @@
import base64
import hashlib
import hmac
import json
import time
from fastapi import HTTPException, Request, status
from app.config import Settings
SESSION_COOKIE = "miem_admin_session"
def verify_admin(username: str, password: str, settings: Settings) -> bool:
    """Check submitted admin credentials against the configured ones.

    Args:
        username: Login supplied by the client.
        password: Password supplied by the client.
        settings: Provides ``admin_username`` and ``admin_password``.

    Returns:
        ``True`` only when both values match.
    """
    # compare_digest() raises TypeError for str values with non-ASCII
    # characters, so the comparison is done on UTF-8 bytes instead.
    username_ok = hmac.compare_digest(
        username.encode("utf-8"), settings.admin_username.encode("utf-8")
    )
    password_ok = hmac.compare_digest(
        password.encode("utf-8"), settings.admin_password.encode("utf-8")
    )
    # Both comparisons always run, so a wrong username does not skip the
    # password check.
    return username_ok and password_ok
def sign_session(username: str, settings: Settings) -> str:
    """Build a signed session token of the form ``<payload>.<signature>``.

    The payload is URL-safe base64 of compact JSON ``{"sub", "iat"}``; the
    signature is the hex HMAC-SHA256 of the payload keyed with
    ``settings.session_secret``.
    """
    claims = {"sub": username, "iat": int(time.time())}
    raw_claims = json.dumps(claims, separators=(",", ":")).encode("utf-8")
    payload = base64.urlsafe_b64encode(raw_claims).decode("ascii")
    mac = hmac.new(settings.session_secret.encode("utf-8"), payload.encode("ascii"), hashlib.sha256)
    return ".".join((payload, mac.hexdigest()))
def read_session(token: str | None, settings: Settings) -> str | None:
    """Validate a session token and return the user name it carries.

    Args:
        token: Cookie value in the ``<payload>.<signature>`` form produced by
            ``sign_session``; may be missing or attacker-controlled.
        settings: Provides ``session_secret`` used to verify the signature.

    Returns:
        The ``"sub"`` claim of a correctly signed token, otherwise ``None``.
        Malformed input never raises.
    """
    if not token or "." not in token:
        return None
    payload, signature = token.rsplit(".", 1)
    try:
        # Genuine tokens are pure ASCII; anything else cannot be valid and
        # must not surface as an encoding error.
        payload_bytes = payload.encode("ascii")
        signature_bytes = signature.encode("ascii")
    except UnicodeEncodeError:
        return None
    expected = hmac.new(settings.session_secret.encode("utf-8"), payload_bytes, hashlib.sha256).hexdigest()
    if not hmac.compare_digest(signature_bytes, expected.encode("ascii")):
        return None
    try:
        data = json.loads(base64.urlsafe_b64decode(payload_bytes))
    except Exception:
        return None
    if not isinstance(data, dict):
        return None
    # NOTE(review): the "iat" claim is not checked, so tokens never expire.
    return data.get("sub")
def require_admin(request: Request, settings: Settings) -> str:
    """Return the logged-in admin name or redirect to the login page.

    Raises:
        HTTPException: 303 with ``Location: /admin/login`` when the session
            cookie is missing or does not validate.
    """
    token = request.cookies.get(SESSION_COOKIE)
    username = read_session(token, settings)
    if username:
        return username
    raise HTTPException(status_code=status.HTTP_303_SEE_OTHER, headers={"Location": "/admin/login"})
def require_mcp_token(request: Request, settings: Settings) -> None:
    """Require a valid ``Authorization: Bearer <token>`` header.

    Args:
        request: Incoming request; only its ``authorization`` header is read.
        settings: Provides the expected ``mcp_token``.

    Raises:
        HTTPException: 401 when the header is missing, malformed, does not
            match, or when no MCP token is configured.
    """
    auth = request.headers.get("authorization", "")
    expected = settings.mcp_token or ""
    presented = auth.removeprefix("Bearer ").strip() if auth.startswith("Bearer ") else ""
    # An empty configured token must never authorise an empty bearer token.
    # The comparison uses bytes because compare_digest() rejects non-ASCII str.
    authorized = (
        bool(expected)
        and bool(presented)
        and hmac.compare_digest(presented.encode("utf-8"), expected.encode("utf-8"))
    )
    if not authorized:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid MCP token")

1
app/services/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Application services."""

159
app/services/admin_data.py Normal file
View File

@@ -0,0 +1,159 @@
from __future__ import annotations
from datetime import date, datetime, time
from math import ceil
from typing import Any
from sqlalchemy import Select, Text, and_, desc, func, or_, select
from sqlalchemy.orm import Session
from app.models import CrawlRun, Employee
# Sort keys accepted by list_employees_page, mapped to the expression used in
# ORDER BY. Keys that are not listed here fall back to Employee.full_name there.
EMPLOYEE_SORTS = {
    "full_name": Employee.full_name,
    "status": Employee.status,
    "first_seen_at": Employee.first_seen_at,
    "last_seen_at": Employee.last_seen_at,
    "dismissed_at": Employee.dismissed_at,
    # Read from the JSON payload and compared as an integer.
    "hse_start_year": Employee.current_data["hse_start_year"].as_integer(),
}
def employee_display_payload(employee: Employee) -> dict[str, Any]:
    """Flatten an employee row into the dict used by the admin tables.

    Values come from the employee columns and from ``current_data``; missing
    JSON parts are treated as empty. Timestamps are ISO strings or ``None``.
    """
    data = employee.current_data or {}
    contacts = data.get("contacts") or {}
    sections = data.get("sections") or []
    positions = data.get("positions") or []
    emails = contacts.get("emails") or []
    phones = contacts.get("phones") or []

    def iso(moment):
        return moment.isoformat() if moment else None

    return {
        "id": employee.id,
        "full_name": employee.full_name,
        "status": employee.status,
        "canonical_url": employee.canonical_url,
        "positions": positions,
        "positions_text": "; ".join(positions),
        "hse_start_year": data.get("hse_start_year"),
        "emails": emails,
        "email_text": ", ".join(emails),
        "phones": phones,
        "phone_text": ", ".join(phones),
        "address": contacts.get("address"),
        "publications_count": _count_section_items(sections, "publications"),
        "courses_count": _count_section_items(sections, "courses_by_year"),
        "first_seen_at": iso(employee.first_seen_at),
        "last_seen_at": iso(employee.last_seen_at),
        "dismissed_at": iso(employee.dismissed_at),
    }
def build_employee_query(
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
) -> Select[tuple[Employee]]:
    """Build the filtered ``SELECT`` over employees used by the directory.

    Args:
        status: Exact employee status to keep.
        q: Case-insensitive substring searched in the name and canonical URL.
            It is matched literally: ``%`` and ``_`` are not wildcards.
        started_from: Keep employees first seen on or after this date.
        started_to: Keep employees first seen on or before this date.
        has_email: ``True``/``False`` to require/forbid an ``@`` in the stored
            JSON; ``None`` disables the filter.

    Returns:
        The statement with all requested filters combined with ``AND``.
    """
    stmt = select(Employee)
    filters = []
    if status:
        filters.append(Employee.status == status)
    if q:
        # Escape LIKE metacharacters so user input cannot act as a wildcard.
        escaped = q.replace("/", "//").replace("%", "/%").replace("_", "/_")
        pattern = f"%{escaped}%"
        filters.append(
            or_(
                Employee.full_name.ilike(pattern, escape="/"),
                Employee.canonical_url.ilike(pattern, escape="/"),
            )
        )
    if started_from:
        filters.append(Employee.first_seen_at >= datetime.combine(started_from, time.min))
    if started_to:
        filters.append(Employee.first_seen_at <= datetime.combine(started_to, time.max))
    # NOTE(review): the email filter looks for "@" anywhere in the serialised
    # JSON, so an "@" outside the contact e-mails also counts as a match.
    if has_email is True:
        filters.append(Employee.current_data.cast(Text).ilike("%@%"))
    elif has_email is False:
        filters.append(or_(Employee.current_data.is_(None), ~Employee.current_data.cast(Text).ilike("%@%")))
    if filters:
        stmt = stmt.where(and_(*filters))
    return stmt
def list_employees_page(
    db: Session,
    *,
    status: str | None = None,
    q: str | None = None,
    started_from: date | None = None,
    started_to: date | None = None,
    has_email: bool | None = None,
    sort: str = "full_name",
    direction: str = "asc",
    limit: int = 50,
    offset: int = 0,
) -> dict[str, Any]:
    """Return one page of employees plus pagination metadata.

    Args:
        db: Database session used for the count and the page query.
        status, q, started_from, started_to, has_email: Filters forwarded to
            ``build_employee_query``.
        sort: Key of ``EMPLOYEE_SORTS``; unknown keys sort by full name.
        direction: ``"desc"`` for descending order, anything else ascending.
        limit: Page size, clamped to 1..200.
        offset: Number of rows to skip, clamped to be non-negative.

    Returns:
        Dict with ``items``, ``total``, ``limit``, ``offset``, ``pages`` and
        the 1-based ``page``.
    """
    limit = max(1, min(limit, 200))
    offset = max(0, offset)
    base_stmt = build_employee_query(
        status=status,
        q=q,
        started_from=started_from,
        started_to=started_to,
        has_email=has_email,
    )
    total = db.scalar(select(func.count()).select_from(base_stmt.subquery())) or 0
    sort_column = EMPLOYEE_SORTS.get(sort, Employee.full_name)
    order = desc(sort_column) if direction == "desc" else sort_column
    # Employee.id breaks ties so rows with equal sort values keep a stable
    # position and cannot repeat or disappear between pages.
    page_stmt = base_stmt.order_by(order, Employee.id).limit(limit).offset(offset)
    employees = db.scalars(page_stmt).all()
    return {
        "items": [employee_display_payload(employee) for employee in employees],
        "total": total,
        "limit": limit,
        "offset": offset,
        "pages": ceil(total / limit) if total else 0,
        "page": (offset // limit) + 1,
    }
def stats_payload(db: Session) -> dict[str, Any]:
    """Collect the dashboard numbers: employee counts and run summaries."""
    latest_run = db.scalar(select(CrawlRun).order_by(desc(CrawlRun.started_at)).limit(1))
    running_run = db.scalar(
        select(CrawlRun).where(CrawlRun.status == "running").order_by(desc(CrawlRun.started_at)).limit(1)
    )
    latest_added = db.scalar(select(Employee).order_by(desc(Employee.first_seen_at)).limit(1))

    def count_employees(status: str | None = None) -> int:
        stmt = select(func.count()).select_from(Employee)
        if status is not None:
            stmt = stmt.where(Employee.status == status)
        return db.scalar(stmt) or 0

    return {
        "total": count_employees(),
        "active": count_employees("active"),
        "dismissed": count_employees("dismissed"),
        "new_in_last_run": latest_run.new_count if latest_run else 0,
        "latest_added": employee_display_payload(latest_added) if latest_added else None,
        # run_payload() itself maps a missing run to None.
        "latest_run": run_payload(latest_run),
        "current_running_run": run_payload(running_run),
    }
def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
    """Serialise a crawl run, adding derived progress fields.

    ``processed_count`` is parsed plus errored profiles; ``progress_percent``
    is that share of ``found_count`` rounded to one decimal, or ``0`` when
    nothing was found. Returns ``None`` when there is no run.
    """
    if not run:
        return None

    processed = run.parsed_count + run.error_count
    if run.found_count:
        percent = round((processed / run.found_count) * 100, 1)
    else:
        percent = 0

    def iso(moment):
        return moment.isoformat() if moment else None

    return {
        "id": run.id,
        "source_url": run.source_url,
        "status": run.status,
        "started_at": iso(run.started_at),
        "finished_at": iso(run.finished_at),
        "found_count": run.found_count,
        "parsed_count": run.parsed_count,
        "new_count": run.new_count,
        "error_count": run.error_count,
        "dismissed_count": run.dismissed_count,
        "processed_count": processed,
        "progress_percent": percent,
        "message": run.message,
    }
def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> int:
total = 0
for section in sections:
if section.get("type") != section_type:
continue
if section_type == "publications":
total += len(section.get("publications") or section.get("items") or [])
elif section_type == "courses_by_year":
total += len(section.get("courses") or [])
return total

View File

@@ -0,0 +1,17 @@
from sqlalchemy import desc, select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlRun
from app.services.crawler import run_crawl
def get_running_run(db: Session) -> CrawlRun | None:
    """Return the most recently started run whose status is ``"running"``."""
    stmt = select(CrawlRun).where(CrawlRun.status == "running")
    stmt = stmt.order_by(desc(CrawlRun.started_at)).limit(1)
    return db.scalar(stmt)
def run_crawl_if_idle(db: Session, settings: Settings) -> tuple[CrawlRun, bool]:
    """Start a crawl unless one is already marked as running.

    Returns:
        ``(run, started)``: the already running run with ``False``, or the
        run returned by ``run_crawl`` with ``True``.
    """
    already_running = get_running_run(db)
    if not already_running:
        return run_crawl(db, settings), True
    return already_running, False

160
app/services/crawler.py Normal file
View File

@@ -0,0 +1,160 @@
import gzip
import hashlib
import json
import time
from datetime import datetime, timezone
import requests
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
# HTTP headers passed to the link collector and the profile parser by run_crawl.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
}
def run_crawl(db: Session, settings: Settings) -> CrawlRun:
    """Crawl the configured source and synchronise employees with it.

    A ``CrawlRun`` row is created up front and its counters are committed as
    the crawl progresses. Per-profile failures are stored as ``CrawlError``
    rows and do not stop the crawl; a failure outside the per-profile loop
    marks the run as ``"failed"``.

    Args:
        db: Database session; it is committed repeatedly during the crawl.
        settings: Provides ``source_url``, ``request_timeout``,
            ``crawl_limit``, ``parser_use_playwright`` and
            ``request_delay_seconds``.

    Returns:
        The refreshed ``CrawlRun`` with its final status and counters.
    """
    source = _ensure_source(db, settings.source_url)
    run = CrawlRun(source_url=source.source_url, status="running")
    db.add(run)
    db.commit()
    db.refresh(run)

    found_keys: set[str] = set()
    parsed_count = 0
    try:
        with requests.Session() as session:
            urls = collect_profile_links(session, source.source_url, HEADERS, settings.request_timeout)
            # Remember every profile the source lists before the limit is
            # applied; otherwise a limited crawl would dismiss every employee
            # that merely fell outside the limit.
            found_keys = {key for key in map(profile_key, urls) if key}
            if settings.crawl_limit:
                urls = urls[: settings.crawl_limit]
            run.found_count = len(urls)
            db.commit()
            for url in urls:
                try:
                    parsed = parse_person_profile(
                        session,
                        url,
                        HEADERS,
                        settings.request_timeout,
                        settings.parser_use_playwright,
                    )
                    if not parsed:
                        continue
                    _upsert_employee(db, run, parsed)
                    run.parsed_count = parsed_count + 1
                    db.commit()
                    # Count the profile only once its data is really stored.
                    parsed_count += 1
                except Exception as exc:
                    # Discard the half-written profile and put the session
                    # back into a usable state before recording the error.
                    db.rollback()
                    run.error_count += 1
                    db.add(
                        CrawlError(
                            crawl_run_id=run.id,
                            profile_url=url,
                            error_type=type(exc).__name__,
                            message=str(exc),
                        )
                    )
                    db.commit()
                finally:
                    time.sleep(settings.request_delay_seconds)
        run.dismissed_count = _mark_dismissed(db, found_keys)
        run.status = "completed"
    except Exception as exc:
        # Without a rollback a failed flush would make the final commit fail
        # too, leaving the run stuck in the "running" state.
        db.rollback()
        run.status = "failed"
        run.message = str(exc)
    finally:
        run.finished_at = datetime.now(timezone.utc)
        db.commit()
        db.refresh(run)
    return run
def _ensure_source(db: Session, source_url: str) -> ParserSource:
    """Return the ``ParserSource`` for ``source_url``, creating it if missing."""
    existing = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url))
    if existing:
        return existing

    created = ParserSource(source_url=source_url, enabled=True)
    db.add(created)
    db.commit()
    db.refresh(created)
    return created
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
    """Create or refresh the employee for a parsed profile and snapshot it.

    ``parsed`` is modified in place: its ``"_html"`` entry is removed so the
    raw page is stored only (gzip-compressed) on the snapshot. A new employee
    increments ``run.new_count``. The employee's tabs are replaced and a new
    ``EmployeeSnapshot`` is added. Nothing is committed here.
    """
    raw_html = parsed.pop("_html", None)
    digest = _checksum(parsed)
    key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
    employee = db.scalar(select(Employee).where(Employee.profile_key == key))
    seen_at = datetime.now(timezone.utc)

    if not employee:
        employee = Employee(
            profile_key=key,
            profile_type=parsed.get("profile_type"),
            profile_id=parsed.get("profile_id"),
            canonical_url=parsed["source_url"],
            first_seen_at=seen_at,
        )
        db.add(employee)
        run.new_count += 1

    employee.full_name = parsed.get("full_name")
    employee.status = "active"
    employee.last_seen_at = seen_at
    employee.dismissed_at = None
    employee.parser_version = parsed.get("parser_version")
    employee.current_data = parsed
    employee.current_checksum = digest
    # Flush so employee.id is available for the rows created below.
    db.flush()

    db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
    db.add_all(
        ProfileTab(
            employee_id=employee.id,
            title=tab.get("title") or "",
            href=tab.get("href") or "",
            data_index=tab.get("data_index"),
        )
        for tab in parsed.get("tabs") or []
    )

    compressed_html = gzip.compress(raw_html.encode("utf-8")) if raw_html else None
    db.add(
        EmployeeSnapshot(
            employee_id=employee.id,
            crawl_run_id=run.id,
            parsed_data=parsed,
            html_snapshot=compressed_html,
            checksum=digest,
            parser_version=parsed.get("parser_version"),
        )
    )
    return employee
def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
    """Dismiss active employees whose profile key is not in ``found_keys``.

    Commits the change and returns how many employees were dismissed.
    """
    active = db.scalars(select(Employee).where(Employee.status == "active")).all()
    dismissed_at = datetime.now(timezone.utc)
    missing = [employee for employee in active if employee.profile_key not in found_keys]
    for employee in missing:
        employee.status = "dismissed"
        employee.dismissed_at = dismissed_at
    db.commit()
    return len(missing)
def _checksum(data: dict) -> str:
payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()

412
app/static/admin.css Normal file
View File

@@ -0,0 +1,412 @@
/* Page shell: body defaults, header bar with navigation, main column, footer
   and the responsive metric grid. */
.admin {
margin: 0;
min-height: 100vh;
color: #1f2937;
background: #f6f7f9;
font-family: Arial, sans-serif;
}
.admin__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 24px;
padding: 18px 32px;
background: #ffffff;
border-bottom: 1px solid #d9dee7;
}
.admin__brand {
margin: 0;
font-size: 20px;
}
.admin__nav {
display: flex;
align-items: center;
gap: 14px;
}
.admin__link {
color: #0f766e;
text-decoration: none;
font-weight: 700;
}
/* Content column: at most 1180px wide, with 16px side gutters on narrow screens. */
.admin__main {
width: min(1180px, calc(100% - 32px));
margin: 28px auto;
}
.admin__footer {
padding: 20px 32px;
color: #6b7280;
border-top: 1px solid #d9dee7;
background: #ffffff;
}
/* Cards wrap automatically; each one is at least 180px wide. */
.admin__grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 16px;
}
/* Metric card: small label above a large number. */
.metric {
padding: 18px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.metric__label {
color: #6b7280;
font-size: 13px;
}
.metric__value {
margin-top: 8px;
font-size: 28px;
font-weight: 700;
}
/* Generic white content panel with a title. */
.panel {
margin-top: 22px;
padding: 20px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.panel__title {
margin: 0 0 16px;
font-size: 18px;
}
/* Simple data table with row separators. */
.table {
width: 100%;
border-collapse: collapse;
}
.table__cell,
.table__head {
padding: 10px 8px;
border-bottom: 1px solid #e5e7eb;
text-align: left;
vertical-align: top;
}
/* Pill-shaped label; blue by default. */
.badge {
display: inline-block;
padding: 3px 8px;
border-radius: 999px;
background: #e0f2fe;
color: #075985;
font-size: 12px;
}
/* Red variant of the badge. */
.badge--dismissed {
background: #fee2e2;
color: #991b1b;
}
/* Stacked form, capped at 380px. */
.form {
display: grid;
gap: 12px;
max-width: 380px;
}
.form__label {
display: grid;
gap: 6px;
font-weight: 700;
}
.form__input,
.form__select {
padding: 10px 12px;
border: 1px solid #cbd5e1;
border-radius: 6px;
}
/* Primary action button. */
.button {
padding: 10px 14px;
border: 0;
border-radius: 6px;
color: #ffffff;
background: #0f766e;
font-weight: 700;
cursor: pointer;
}
/* Text-only button variant without a filled background. */
.button--ghost {
color: #0f766e;
background: transparent;
}
/* Dark block for preformatted text; wraps long lines and scrolls horizontally if needed. */
.code {
overflow-x: auto;
padding: 14px;
background: #111827;
color: #f9fafb;
border-radius: 8px;
white-space: pre-wrap;
}
/* Row of secondary statistics; items wrap and are at least 220px wide. */
.stats-strip {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
gap: 14px;
margin-top: 16px;
}
.stats-strip__item {
padding: 14px 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.stats-strip__label {
display: block;
color: #6b7280;
font-size: 12px;
text-transform: uppercase;
}
.stats-strip__value {
display: block;
margin-top: 6px;
color: #1f2937;
font-weight: 700;
}
/* Crawl progress panel: header row, meta line, bar and percentage. */
.progress-panel {
display: grid;
gap: 12px;
}
.progress-panel__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
}
.progress-panel__body {
display: grid;
gap: 10px;
}
.progress-panel__meta {
display: flex;
flex-wrap: wrap;
gap: 12px;
color: #4b5563;
font-size: 14px;
}
.progress-panel__percent {
color: #0f766e;
font-weight: 700;
}
.progress-panel__empty {
margin: 0;
color: #6b7280;
}
/* Track of the progress bar; overflow is hidden so the fill keeps the rounded ends. */
.progress-bar {
height: 12px;
overflow: hidden;
background: #e5e7eb;
border-radius: 999px;
}
/* Fill starts at zero width; the width set inline or by script is animated. */
.progress-bar__fill {
height: 100%;
width: 0;
background: #0f766e;
transition: width 0.25s ease;
}
/* Directory page layout: header, filter form, scrollable table, pagination. */
.directory {
display: grid;
gap: 18px;
}
.directory__header {
display: flex;
align-items: end;
justify-content: space-between;
gap: 16px;
}
.directory__title {
margin: 0;
font-size: 24px;
}
.directory__summary {
margin: 6px 0 0;
color: #6b7280;
}
/* Filter row: one wide search column followed by six narrower controls. */
.directory__filters {
display: grid;
grid-template-columns: minmax(220px, 1.7fr) repeat(6, minmax(120px, 1fr));
gap: 10px;
padding: 16px;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
/* min-width: 0 lets the control shrink inside its grid track. */
.directory__input {
min-width: 0;
padding: 10px 12px;
border: 1px solid #cbd5e1;
border-radius: 6px;
}
/* Wrapper that scrolls horizontally because the table has a fixed minimum width. */
.directory__table-wrap {
overflow-x: auto;
background: #ffffff;
border: 1px solid #d9dee7;
border-radius: 8px;
}
.directory__pagination {
display: flex;
align-items: center;
justify-content: center;
gap: 16px;
}
.directory__page {
color: #4b5563;
font-weight: 700;
}
/* Directory table. */
.directory-table {
width: 100%;
min-width: 1120px;
border-collapse: collapse;
}
.directory-table__head {
padding: 12px 10px;
color: #374151;
background: #f9fafb;
border-bottom: 1px solid #e5e7eb;
font-size: 13px;
text-align: left;
white-space: nowrap;
}
.directory-table__cell {
max-width: 280px;
padding: 12px 10px;
border-bottom: 1px solid #e5e7eb;
vertical-align: top;
}
/* Rows are clickable, so they get a pointer cursor and a hover tint. */
.directory-table__row {
cursor: pointer;
}
.directory-table__row:hover {
background: #f0fdfa;
}
.directory-table__empty {
padding: 28px;
color: #6b7280;
text-align: center;
}
/* Modifier classes that hide a column's header and cells. */
.directory-table__cell--hidden,
.directory-table__head--hidden {
display: none;
}
/* Column chooser dialog: full-viewport overlay with a centred panel. */
.columns-modal {
position: fixed;
inset: 0;
z-index: 50;
display: grid;
place-items: center;
padding: 20px;
}
/* The display: grid above would otherwise keep a [hidden] modal visible. */
.columns-modal[hidden] {
display: none;
}
/* Dimmed layer behind the panel. */
.columns-modal__backdrop {
position: absolute;
inset: 0;
background: rgba(17, 24, 39, 0.54);
}
/* Panel is positioned so it sits above the backdrop; it scrolls when taller than the viewport. */
.columns-modal__panel {
position: relative;
width: min(620px, 100%);
max-height: min(720px, calc(100vh - 40px));
overflow: auto;
padding: 20px;
background: #ffffff;
border-radius: 8px;
box-shadow: 0 24px 80px rgba(15, 23, 42, 0.22);
}
.columns-modal__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}
.columns-modal__title {
margin: 0;
font-size: 18px;
}
.columns-modal__grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 10px;
margin-top: 18px;
}
.columns-modal__option {
display: flex;
align-items: center;
gap: 8px;
padding: 10px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 6px;
}
.columns-modal__checkbox {
width: 16px;
height: 16px;
}
/* Up to 920px: filters use two columns; the two header rows stack vertically. */
@media (max-width: 920px) {
.directory__filters {
grid-template-columns: 1fr 1fr;
}
.progress-panel__header,
.directory__header {
align-items: stretch;
flex-direction: column;
}
}
/* Up to 620px: filters use a single column. */
@media (max-width: 620px) {
.directory__filters {
grid-template-columns: 1fr;
}
}

111
app/static/admin.js Normal file
View File

@@ -0,0 +1,111 @@
(function () {
// Columns shown when nothing usable is stored in localStorage.
const columnDefaults = [
"full_name",
"status",
"positions",
"hse_start_year",
"email",
"last_seen_at",
"dismissed_at",
"profile",
];
// localStorage key under which the chosen column list is saved as JSON.
const storageKey = "miem.directory.columns";
// Returns the stored column list, or the defaults when storage is empty,
// unreadable, or does not hold a non-empty array.
function readColumns() {
  let stored;
  try {
    stored = JSON.parse(localStorage.getItem(storageKey) || "[]");
  } catch (_error) {
    return columnDefaults;
  }
  if (Array.isArray(stored) && stored.length) {
    return stored;
  }
  return columnDefaults;
}
// Saves the chosen column list. Saving is best-effort: setItem can throw
// (for example when storage is full or unavailable), and that must not stop
// the caller from applying the selection to the table.
function writeColumns(columns) {
  try {
    localStorage.setItem(storageKey, JSON.stringify(columns));
  } catch (_error) {
    // The selection still applies to the current page; it is just not saved.
  }
}
// Shows or hides every [data-column] header/cell according to `columns` and
// syncs the chooser checkboxes with the same list.
function applyColumns(columns) {
  const hiddenClassByRole = [
    ["directory-table__cell", "directory-table__cell--hidden"],
    ["directory-table__head", "directory-table__head--hidden"],
  ];
  for (const node of document.querySelectorAll("[data-column]")) {
    const hidden = !columns.includes(node.dataset.column);
    for (const [roleClass, hiddenClass] of hiddenClassByRole) {
      // Only the modifier that matches the node's own role is ever added.
      node.classList.toggle(hiddenClass, hidden && node.classList.contains(roleClass));
    }
  }
  for (const checkbox of document.querySelectorAll("[data-column-toggle]")) {
    checkbox.checked = columns.includes(checkbox.value);
  }
}
// Wires up the directory table: column chooser modal, column toggles and
// clickable rows. Does nothing on pages without [data-directory-table].
function setupColumns() {
  if (!document.querySelector("[data-directory-table]")) return;

  let columns = readColumns();
  const modal = document.querySelector("[data-columns-modal]");
  applyColumns(columns);

  const bindModalButtons = (selector, hidden) => {
    for (const button of document.querySelectorAll(selector)) {
      button.addEventListener("click", () => {
        modal.hidden = hidden;
      });
    }
  };
  bindModalButtons("[data-columns-open]", false);
  bindModalButtons("[data-columns-close]", true);

  const onToggle = () => {
    const checked = document.querySelectorAll("[data-column-toggle]:checked");
    columns = Array.from(checked).map((item) => item.value);
    // Never allow an empty table: fall back to the name column.
    if (!columns.length) columns = ["full_name"];
    writeColumns(columns);
    applyColumns(columns);
  };
  for (const checkbox of document.querySelectorAll("[data-column-toggle]")) {
    checkbox.addEventListener("change", onToggle);
  }

  for (const row of document.querySelectorAll("[data-row-href]")) {
    row.addEventListener("click", (event) => {
      // Clicks on interactive elements inside the row keep their own behaviour.
      if (event.target.closest("a, button, input, select, label")) return;
      window.location.href = row.dataset.rowHref;
    });
  }
}
// Keeps the progress panel in sync with /api/crawl-runs/latest while a crawl
// is running. Does nothing on pages without [data-progress-panel].
function setupProgress() {
  const panel = document.querySelector("[data-progress-panel]");
  if (!panel) return;

  const pollDelayMs = 4000;

  const setText = (selector, value) => {
    const node = document.querySelector(selector);
    if (node) node.textContent = value;
  };

  const update = (run) => {
    if (!run) return;
    setText("[data-progress-status]", run.status);
    setText("[data-progress-processed]", run.processed_count);
    setText("[data-progress-found]", run.found_count);
    setText("[data-progress-errors]", run.error_count);
    setText("[data-progress-percent]", run.progress_percent);
    const fill = document.querySelector("[data-progress-fill]");
    if (fill) fill.style.width = `${run.progress_percent}%`;
  };

  // Resolves to true while a run is still in progress.
  const poll = async () => {
    try {
      const response = await fetch("/api/crawl-runs/latest", { credentials: "same-origin" });
      if (!response.ok) return false;
      const data = await response.json();
      update(data.running || data.latest);
      return Boolean(data.running);
    } catch (_error) {
      return false;
    }
  };

  // The next poll is scheduled only after the previous one has finished, so
  // slow responses cannot overlap or be applied out of order.
  const scheduleNext = () => {
    window.setTimeout(async () => {
      const keepGoing = await poll();
      if (keepGoing) scheduleNext();
    }, pollDelayMs);
  };
  scheduleNext();
}
setupColumns();
setupProgress();
})();

30
app/templates/base.html Normal file
View File

@@ -0,0 +1,30 @@
<!doctype html>
{# Base layout for the admin pages. Child templates fill the "title",
   "content" and "scripts" blocks. The footer reads backend_version and
   frontend_version from the template context. #}
<html lang="ru">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{% block title %}MIEM Employees{% endblock %}</title>
<link rel="stylesheet" href="/static/admin.css">
</head>
<body class="admin">
<header class="admin__header">
<h1 class="admin__brand">MIEM Employees</h1>
<nav class="admin__nav">
<a class="admin__link" href="/admin">Dashboard</a>
<a class="admin__link" href="/admin/directory">Directory</a>
<a class="admin__link" href="/admin/employees">Employees</a>
<a class="admin__link" href="/admin/runs">Runs</a>
{# Logout is a POST form rather than a link. #}
<form method="post" action="/admin/logout">
<button class="button button--ghost" type="submit">Logout</button>
</form>
</nav>
</header>
<main class="admin__main">
{% block content %}{% endblock %}
</main>
<footer class="admin__footer">
Backend {{ backend_version }} · Frontend {{ frontend_version }}
</footer>
{# Page-specific scripts are placed at the end of the body. #}
{% block scripts %}{% endblock %}
</body>
</html>

View File

@@ -0,0 +1,62 @@
{% extends "base.html" %}
{# Dashboard: headline counters, secondary stats, live crawl progress and the
   list of latest runs. Reads "counts" and "runs" (and optionally "latest_run")
   from the template context. #}
{% block title %}Dashboard · MIEM Employees{% endblock %}
{% block content %}
<section class="admin__grid">
<div class="metric"><div class="metric__label">Total</div><div class="metric__value">{{ counts.total }}</div></div>
<div class="metric"><div class="metric__label">Active</div><div class="metric__value">{{ counts.active }}</div></div>
<div class="metric"><div class="metric__label">New in last run</div><div class="metric__value">{{ counts.new_in_last_run }}</div></div>
<div class="metric"><div class="metric__label">Dismissed</div><div class="metric__value">{{ counts.dismissed }}</div></div>
</section>
<section class="stats-strip">
<div class="stats-strip__item">
<span class="stats-strip__label">Latest added</span>
{% if counts.latest_added %}
<a class="stats-strip__value" href="/admin/employees/{{ counts.latest_added.id }}">{{ counts.latest_added.full_name or counts.latest_added.canonical_url }}</a>
{% else %}
<span class="stats-strip__value">No employees yet</span>
{% endif %}
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Runs</span>
<span class="stats-strip__value">{{ counts.runs }}</span>
</div>
<div class="stats-strip__item">
<span class="stats-strip__label">Errors</span>
<span class="stats-strip__value">{{ counts.errors }}</span>
</div>
</section>
{# The data-progress-* attributes are the hooks admin.js updates while polling. #}
<section class="panel progress-panel" data-progress-panel>
<div class="progress-panel__header">
<h2 class="panel__title">Parsing progress</h2>
<form method="post" action="/admin/crawl-now">
<button class="button" type="submit">Start crawl now</button>
</form>
</div>
{# Prefer the run summaries inside "counts": they carry processed_count and
   progress_percent, which the markup below needs. A separately passed
   latest_run is kept as the last fallback. #}
{% set run = counts.current_running_run or counts.latest_run or latest_run %}
<div class="progress-panel__body" data-progress-body>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status if run else "idle" }}</span>
<span><span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span> processed</span>
<span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ run.progress_percent if run else 0 }}%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>{{ run.progress_percent if run else 0 }}</span>%</div>
</div>
</section>
<section class="panel">
<h2 class="panel__title">Latest runs</h2>
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Parsed</th><th class="table__head">Errors</th><th class="table__head">Started</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_at }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -0,0 +1,112 @@
{% extends "base.html" %}
{% block title %}Directory · MIEM Employees{% endblock %}
{% block content %}
<section class="directory">
<div class="directory__header">
<div>
<h2 class="directory__title">Directory</h2>
<p class="directory__summary">{{ page.total }} employees found</p>
</div>
<button class="button" type="button" data-columns-open>Columns</button>
</div>
<form class="directory__filters" method="get" action="/admin/directory">
<input class="directory__input" name="q" value="{{ filters.q }}" placeholder="Name or URL">
<select class="directory__input" name="status">
<option value="" {% if not filters.status %}selected{% endif %}>All statuses</option>
<option value="active" {% if filters.status == "active" %}selected{% endif %}>Active</option>
<option value="dismissed" {% if filters.status == "dismissed" %}selected{% endif %}>Dismissed</option>
</select>
<select class="directory__input" name="has_email">
<option value="" {% if not filters.has_email %}selected{% endif %}>Any email</option>
<option value="true" {% if filters.has_email == "true" %}selected{% endif %}>Has email</option>
<option value="false" {% if filters.has_email == "false" %}selected{% endif %}>No email</option>
</select>
<input class="directory__input" type="date" name="started_from" value="{{ filters.started_from }}" aria-label="First seen from">
<input class="directory__input" type="date" name="started_to" value="{{ filters.started_to }}" aria-label="First seen to">
<select class="directory__input" name="sort">
{% for value, label in [("full_name", "Name"), ("status", "Status"), ("hse_start_year", "HSE start"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed")] %}
<option value="{{ value }}" {% if filters.sort == value %}selected{% endif %}>Sort: {{ label }}</option>
{% endfor %}
</select>
<select class="directory__input" name="direction">
<option value="asc" {% if filters.direction == "asc" %}selected{% endif %}>Ascending</option>
<option value="desc" {% if filters.direction == "desc" %}selected{% endif %}>Descending</option>
</select>
<button class="button" type="submit">Apply</button>
</form>
<div class="directory__table-wrap">
<table class="directory-table" data-directory-table>
<thead>
<tr>
<th class="directory-table__head" data-column="full_name">Name</th>
<th class="directory-table__head" data-column="status">Status</th>
<th class="directory-table__head" data-column="positions">Positions</th>
<th class="directory-table__head" data-column="hse_start_year">HSE start</th>
<th class="directory-table__head" data-column="email">Email</th>
<th class="directory-table__head" data-column="phone">Phone</th>
<th class="directory-table__head" data-column="address">Address</th>
<th class="directory-table__head" data-column="publications_count">Publications</th>
<th class="directory-table__head" data-column="courses_count">Courses</th>
<th class="directory-table__head" data-column="first_seen_at">First seen</th>
<th class="directory-table__head" data-column="last_seen_at">Last seen</th>
<th class="directory-table__head" data-column="dismissed_at">Dismissed</th>
<th class="directory-table__head" data-column="profile">Profile</th>
</tr>
</thead>
<tbody>
{% for employee in page.items %}
<tr class="directory-table__row" data-row-href="/admin/employees/{{ employee.id }}">
<td class="directory-table__cell" data-column="full_name">{{ employee.full_name or "No name" }}</td>
<td class="directory-table__cell" data-column="status"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
<td class="directory-table__cell" data-column="positions">{{ employee.positions_text }}</td>
<td class="directory-table__cell" data-column="hse_start_year">{{ employee.hse_start_year or "" }}</td>
<td class="directory-table__cell" data-column="email">{{ employee.email_text }}</td>
<td class="directory-table__cell" data-column="phone">{{ employee.phone_text }}</td>
<td class="directory-table__cell" data-column="address">{{ employee.address or "" }}</td>
<td class="directory-table__cell" data-column="publications_count">{{ employee.publications_count }}</td>
<td class="directory-table__cell" data-column="courses_count">{{ employee.courses_count }}</td>
<td class="directory-table__cell" data-column="first_seen_at">{{ employee.first_seen_at or "" }}</td>
<td class="directory-table__cell" data-column="last_seen_at">{{ employee.last_seen_at or "" }}</td>
<td class="directory-table__cell" data-column="dismissed_at">{{ employee.dismissed_at or "" }}</td>
<td class="directory-table__cell" data-column="profile"><a class="admin__link" href="{{ employee.canonical_url }}">Open</a></td>
</tr>
{% else %}
<tr><td class="directory-table__empty" colspan="13">No employees match these filters.</td></tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="directory__pagination">
{# Pagination links reuse the current query string and only replace `offset`. #}
{# Clamp at zero: when offset is smaller than limit (e.g. a hand-edited URL) the plain subtraction would link to a negative offset. #}
{% set prev_offset = [filters.offset - filters.limit, 0] | max %}
{% set next_offset = filters.offset + filters.limit %}
{% if filters.offset > 0 %}
<a class="admin__link" rel="prev" href="{{ request.url.include_query_params(offset=prev_offset) }}">Previous</a>
{% endif %}
<span class="directory__page">Page {{ page.page }}{% if page.pages %} of {{ page.pages }}{% endif %}</span>
{# Only offer "Next" while at least one more row exists past the current window. #}
{% if next_offset < page.total %}
<a class="admin__link" rel="next" href="{{ request.url.include_query_params(offset=next_offset) }}">Next</a>
{% endif %}
</div>
</section>
<div class="columns-modal" data-columns-modal hidden>
<div class="columns-modal__backdrop" data-columns-close></div>
<section class="columns-modal__panel" aria-label="Column settings">
<div class="columns-modal__header">
<h3 class="columns-modal__title">Visible columns</h3>
<button class="button button--ghost" type="button" data-columns-close>Close</button>
</div>
<div class="columns-modal__grid">
{% for key, label in [("full_name", "Name"), ("status", "Status"), ("positions", "Positions"), ("hse_start_year", "HSE start"), ("email", "Email"), ("phone", "Phone"), ("address", "Address"), ("publications_count", "Publications"), ("courses_count", "Courses"), ("first_seen_at", "First seen"), ("last_seen_at", "Last seen"), ("dismissed_at", "Dismissed"), ("profile", "Profile")] %}
<label class="columns-modal__option"><input class="columns-modal__checkbox" type="checkbox" value="{{ key }}" data-column-toggle> {{ label }}</label>
{% endfor %}
</div>
</section>
</div>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

View File

@@ -0,0 +1,28 @@
{% extends "base.html" %}
{% block title %}{# full_name may be empty; fall back to profile_key exactly like the page heading does #}{{ employee.full_name or employee.profile_key }} · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">{{ employee.full_name or employee.profile_key }}</h2>
<p><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></p>
<p><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></p>
<h3>Tabs</h3>
<ul>
{% for tab in employee.tabs %}
<li><a class="admin__link" href="{{ tab.href }}">{{ tab.title }}</a></li>
{% endfor %}
</ul>
<h3>Current data</h3>
<pre class="code">{{ employee.current_data | tojson(indent=2) }}</pre>
</section>
<section class="panel">
<h2 class="panel__title">Snapshots</h2>
<table class="table">
<thead><tr><th class="table__head">Captured</th><th class="table__head">Checksum</th><th class="table__head">Parser</th></tr></thead>
<tbody>
{% for snapshot in snapshots %}
<tr><td class="table__cell">{{ snapshot.captured_at }}</td><td class="table__cell">{{ snapshot.checksum }}</td><td class="table__cell">{{ snapshot.parser_version }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}

View File

@@ -0,0 +1,29 @@
{% extends "base.html" %}
{% block title %}Employees · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<h2 class="panel__title">Employees</h2>
<form class="form" method="get" action="/admin/employees">
<input class="form__input" name="q" value="{{ q }}" placeholder="Name or URL">
<select class="form__select" name="status">
<option value="" {% if not status %}selected{% endif %}>All</option>
<option value="active" {% if status == "active" %}selected{% endif %}>Active</option>
<option value="dismissed" {% if status == "dismissed" %}selected{% endif %}>Dismissed</option>
</select>
<button class="button" type="submit">Search</button>
</form>
<table class="table">
<thead><tr><th class="table__head">Name</th><th class="table__head">Status</th><th class="table__head">Last seen</th><th class="table__head">Profile</th></tr></thead>
<tbody>
{% for employee in employees %}
<tr>
<td class="table__cell"><a class="admin__link" href="/admin/employees/{{ employee.id }}">{{ employee.full_name or employee.profile_key }}</a></td>
<td class="table__cell"><span class="badge {% if employee.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee.status }}</span></td>
<td class="table__cell">{{ employee.last_seen_at }}</td>
<td class="table__cell"><a class="admin__link" href="{{ employee.canonical_url }}">{{ employee.canonical_url }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}

25
app/templates/login.html Normal file
View File

@@ -0,0 +1,25 @@
<!doctype html>
<html lang="ru">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Login · MIEM Employees</title>
<link rel="stylesheet" href="/static/admin.css">
</head>
<body class="admin">
<main class="admin__main">
<section class="panel">
<h1 class="panel__title">Admin login</h1>
{% if error %}<p>{{ error }}</p>{% endif %}
<form class="form" method="post" action="/admin/login">
<label class="form__label">Login <input class="form__input" name="username" autocomplete="username"></label>
<label class="form__label">Password <input class="form__input" name="password" type="password" autocomplete="current-password"></label>
<button class="button" type="submit">Sign in</button>
</form>
</section>
</main>
<footer class="admin__footer">
Backend {{ backend_version }} · Frontend {{ frontend_version }}
</footer>
</body>
</html>

60
app/templates/runs.html Normal file
View File

@@ -0,0 +1,60 @@
{% extends "base.html" %}
{% block title %}Runs · MIEM Employees{% endblock %}
{% block content %}
<section class="panel">
<div class="progress-panel__header">
<h2 class="panel__title">Crawl runs</h2>
<form method="post" action="/admin/runs"><button class="button" type="submit">Start crawl now</button></form>
</div>
{# Progress summary for the first entry of `runs`; with no runs the same panel is rendered in an idle, all-zero state. #}
{% set run = runs[0] if runs else none %}
{% set processed = (run.parsed_count + run.error_count) if run else 0 %}
{% set found = run.found_count if run else 0 %}
{# Guard against division by zero while the run has not discovered any profiles yet. #}
{% set percent = ((processed / found) * 100) | round(1) if found else 0 %}
<div class="progress-panel" data-progress-panel>
<div class="progress-panel__meta">
<span data-progress-status>{{ run.status if run else "idle" }}</span>
<span><span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ found }}</span> processed</span>
<span><span data-progress-errors>{{ run.error_count if run else 0 }}</span> errors</span>
</div>
<div class="progress-bar" aria-label="Parsing progress">
<div class="progress-bar__fill" data-progress-fill style="width: {{ percent }}%"></div>
</div>
<div class="progress-panel__percent"><span data-progress-percent>{{ percent }}</span>%</div>
</div>
<table class="table">
<thead><tr><th class="table__head">ID</th><th class="table__head">Status</th><th class="table__head">Found</th><th class="table__head">Parsed</th><th class="table__head">New</th><th class="table__head">Errors</th><th class="table__head">Dismissed</th></tr></thead>
<tbody>
{% for run in runs %}
<tr><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
<section class="panel">
<h2 class="panel__title">Recent errors</h2>
<table class="table">
<thead><tr><th class="table__head">Run</th><th class="table__head">Profile</th><th class="table__head">Error</th></tr></thead>
<tbody>
{% for error in errors %}
<tr><td class="table__cell">{{ error.crawl_run_id }}</td><td class="table__cell">{{ error.profile_url }}</td><td class="table__cell">{{ error.error_type }}: {{ error.message }}</td></tr>
{% endfor %}
</tbody>
</table>
</section>
{% endblock %}
{% block scripts %}
<script src="/static/admin.js"></script>
{% endblock %}

3
app/version.py Normal file
View File

@@ -0,0 +1,3 @@
# Version strings for the project. All three currently carry the same value.
# Overall application release version.
APP_VERSION: str = "0.2.0"
# Version of the admin frontend assets.
# NOTE(review): presumably the value shown as `frontend_version` in the template footer — confirm.
FRONTEND_VERSION: str = "0.2.0"
# Version of the backend.
# NOTE(review): presumably what /api/health reports as `backend_version` (the API test expects "0.2.0") — confirm.
BACKEND_VERSION: str = "0.2.0"

45
app/worker.py Normal file
View File

@@ -0,0 +1,45 @@
import logging
import signal
import time
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from app.config import get_settings
from app.db import SessionLocal, init_db
from app.services.crawler import run_crawl
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
def crawl_once() -> None:
    """Run one crawl inside a dedicated database session and log a summary line."""
    config = get_settings()
    with SessionLocal() as session:
        finished_run = run_crawl(session, config)
        # Read the counters while the session is still open.
        logger.info(
            "crawl finished: id=%s status=%s parsed=%s errors=%s",
            finished_run.id,
            finished_run.status,
            finished_run.parsed_count,
            finished_run.error_count,
        )
def main() -> None:
    """Start the background scheduler and block until SIGTERM or SIGINT arrives.

    Initialises the database, registers ``crawl_once`` under the cron
    expression from ``settings.crawl_cron`` and then idles in the main thread.
    The scheduler is shut down on the way out.
    """
    init_db()
    settings = get_settings()

    tz_name = "Europe/Moscow"
    scheduler = BackgroundScheduler(timezone=tz_name)
    # A ready-made trigger instance does not inherit the scheduler's timezone;
    # without an explicit timezone the cron fields would be evaluated in the
    # host's local zone instead of Moscow time.
    trigger = CronTrigger.from_crontab(settings.crawl_cron, timezone=tz_name)
    scheduler.add_job(crawl_once, trigger, id="weekly_miem_crawl", replace_existing=True)
    scheduler.start()
    logger.info("worker started with cron=%s", settings.crawl_cron)

    stop = False

    def _stop(*_: object) -> None:
        # Signal handler: only flips the flag; the main loop does the real shutdown.
        nonlocal stop
        stop = True

    signal.signal(signal.SIGTERM, _stop)
    signal.signal(signal.SIGINT, _stop)

    try:
        while not stop:
            time.sleep(1)
    finally:
        # Stop the scheduler thread even if the wait loop exits via an exception.
        scheduler.shutdown()
if __name__ == "__main__":
main()

75
migrations/001_init.sql Normal file
View File

@@ -0,0 +1,75 @@
-- Source URLs known to the parser; each URL may be stored only once (UNIQUE).
-- NOTE(review): `enabled` presumably controls whether a source is crawled — confirm against the crawler.
CREATE TABLE IF NOT EXISTS parser_sources (
id SERIAL PRIMARY KEY,
source_url TEXT NOT NULL UNIQUE,
enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- One row per crawl run. A new row starts in status 'running'; finished_at stays NULL until it is set.
-- Per the README, found_count, parsed_count and error_count are updated while the run is in progress
-- (the admin progress bar is computed from them), and new_count stores how many employees were
-- newly added during the run.
CREATE TABLE IF NOT EXISTS crawl_runs (
id SERIAL PRIMARY KEY,
source_url TEXT NOT NULL,
status VARCHAR(32) NOT NULL DEFAULT 'running',
started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
finished_at TIMESTAMPTZ,
found_count INTEGER NOT NULL DEFAULT 0,
parsed_count INTEGER NOT NULL DEFAULT 0,
new_count INTEGER NOT NULL DEFAULT 0,
error_count INTEGER NOT NULL DEFAULT 0,
dismissed_count INTEGER NOT NULL DEFAULT 0,
message TEXT
);
-- Employee profiles, identified by a unique profile_key (the tests use keys such as 'staff:person').
-- Per the README, employees found by a crawl get status 'active' and a refreshed last_seen_at;
-- active employees missing from the source list get status 'dismissed' and dismissed_at.
-- current_data holds the latest parsed profile as JSONB; full_name and current_data are nullable.
CREATE TABLE IF NOT EXISTS employees (
id SERIAL PRIMARY KEY,
profile_key VARCHAR(255) NOT NULL UNIQUE,
profile_type VARCHAR(50),
profile_id VARCHAR(255),
canonical_url TEXT NOT NULL,
full_name TEXT,
status VARCHAR(32) NOT NULL DEFAULT 'active',
first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
last_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
dismissed_at TIMESTAMPTZ,
parser_version VARCHAR(32),
current_data JSONB,
current_checksum VARCHAR(64),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Indexes on the two columns the admin listings filter and sort by (name, status).
CREATE INDEX IF NOT EXISTS ix_employees_full_name ON employees (full_name);
CREATE INDEX IF NOT EXISTS ix_employees_status ON employees (status);
-- Parse history for an employee: per the README every successful parse stores one row here.
-- crawl_run_id is nullable, so a snapshot does not have to be linked to a run.
-- html_snapshot is optional binary data.
-- NOTE(review): presumably the fetched page HTML — confirm against the crawler.
CREATE TABLE IF NOT EXISTS employee_snapshots (
id SERIAL PRIMARY KEY,
employee_id INTEGER NOT NULL REFERENCES employees(id),
crawl_run_id INTEGER REFERENCES crawl_runs(id),
parsed_data JSONB NOT NULL,
html_snapshot BYTEA,
checksum VARCHAR(64) NOT NULL,
parser_version VARCHAR(32),
captured_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Supports lookups of all snapshots belonging to one employee.
CREATE INDEX IF NOT EXISTS ix_employee_snapshots_employee_id ON employee_snapshots (employee_id);
-- Errors recorded for a crawl run. Every row belongs to a run; profile_url is nullable,
-- so an error does not have to be tied to a specific profile page.
CREATE TABLE IF NOT EXISTS crawl_errors (
id SERIAL PRIMARY KEY,
crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id),
profile_url TEXT,
error_type VARCHAR(255) NOT NULL,
message TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Supports lookups of all errors belonging to one run.
CREATE INDEX IF NOT EXISTS ix_crawl_errors_run_id ON crawl_errors (crawl_run_id);
-- Navigation tabs of an employee profile (title + link), one row per tab.
-- NOTE(review): data_index presumably mirrors the `data-index` attribute of the source link — confirm against the parser.
CREATE TABLE IF NOT EXISTS profile_tabs (
id SERIAL PRIMARY KEY,
employee_id INTEGER NOT NULL REFERENCES employees(id),
title TEXT NOT NULL,
href TEXT NOT NULL,
data_index VARCHAR(64)
);
-- Supports lookups of all tabs belonging to one employee.
CREATE INDEX IF NOT EXISTS ix_profile_tabs_employee_id ON profile_tabs (employee_id);

View File

@@ -0,0 +1,2 @@
-- Adds crawl_runs.new_count if the column is not present yet.
-- IF NOT EXISTS makes this a no-op where the column already exists, so the migration is safe to re-run.
ALTER TABLE crawl_runs
ADD COLUMN IF NOT EXISTS new_count INTEGER NOT NULL DEFAULT 0;

23
tests/conftest.py Normal file
View File

@@ -0,0 +1,23 @@
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool
from app.db import Base
@pytest.fixture()
def db_session():
    """Yield a SQLAlchemy session backed by a fresh in-memory SQLite database.

    All tables registered on ``Base`` are created before the test. Afterwards
    the session is closed, the tables are dropped and the engine is disposed.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        # StaticPool keeps a single connection so every session sees the same in-memory database.
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        yield session
    finally:
        session.close()
        Base.metadata.drop_all(engine)
        # Release the pooled connection instead of leaving it to garbage collection.
        engine.dispose()

98
tests/test_admin_data.py Normal file
View File

@@ -0,0 +1,98 @@
from datetime import datetime, timezone
from app.models import CrawlRun, Employee
from app.services.admin_data import employee_display_payload, list_employees_page, run_payload, stats_payload
def test_employee_display_payload_extracts_common_fields(db_session):
    """The display payload flattens positions, e-mails and section counts from current_data."""
    profile_data = {
        "positions": ["Professor"],
        "hse_start_year": 2024,
        "contacts": {"emails": ["person@hse.ru"], "phones": ["+79990000000"], "address": "Moscow"},
        "sections": [
            {"type": "publications", "publications": [{"title": "Paper"}]},
            {"type": "courses_by_year", "courses": [{"title": "Course"}]},
        ],
    }
    person = Employee(
        profile_key="staff:person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data=profile_data,
    )

    result = employee_display_payload(person)

    assert result["positions_text"] == "Professor"
    assert result["email_text"] == "person@hse.ru"
    assert result["publications_count"] == 1
    assert result["courses_count"] == 1
def test_list_employees_page_filters_sorts_and_paginates(db_session):
    """Filtering by status='active' returns only the matching employee."""
    dismissed_employee = Employee(
        profile_key="staff:b",
        canonical_url="https://www.hse.ru/staff/b",
        full_name="Beta",
        status="dismissed",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": []}},
    )
    db_session.add(dismissed_employee)
    active_employee = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": ["alpha@hse.ru"]}},
    )
    db_session.add(active_employee)
    db_session.commit()

    result = list_employees_page(db_session, status="active", sort="full_name", direction="asc", limit=10)

    assert result["total"] == 1
    first_item = result["items"][0]
    assert first_item["full_name"] == "Alpha"
def test_stats_payload_uses_latest_run_new_count(db_session):
    """Stats report employee totals and the new_count of the stored crawl run."""
    alpha = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add(alpha)
    completed_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=3)
    db_session.add(completed_run)
    db_session.commit()

    stats = stats_payload(db_session)

    assert stats["total"] == 1
    assert stats["active"] == 1
    assert stats["new_in_last_run"] == 3
def test_run_payload_calculates_progress():
    """Processed = parsed + errors; progress is that share of found_count in percent."""
    counters = {"found_count": 10, "parsed_count": 4, "error_count": 1}
    running = CrawlRun(source_url="https://miem.hse.ru/persons", status="running", **counters)

    result = run_payload(running)

    assert result["processed_count"] == 5
    assert result["progress_percent"] == 50.0

159
tests/test_api_mcp.py Normal file
View File

@@ -0,0 +1,159 @@
from datetime import datetime, timezone
from fastapi.testclient import TestClient
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool
from app.config import Settings, get_settings
from app.db import Base, get_db
from app.main import app
from app.models import CrawlRun, Employee
from app.security import SESSION_COOKIE, sign_session
def test_health_returns_versions():
    """GET /api/health answers 200 and includes the backend version."""
    response = TestClient(app).get("/api/health")

    assert response.status_code == 200
    body = response.json()
    assert body["backend_version"] == "0.2.0"
def test_mcp_requires_token_and_lists_tools():
    """POST /mcp rejects a request without a bearer token and lists tools with a valid one."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)

    def override_db():
        session = Session()
        try:
            yield session
        finally:
            session.close()

    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
    # The overrides live on the shared `app` object; clear them even when an
    # assertion fails so they cannot leak into other tests.
    try:
        client = TestClient(app)

        unauthorized = client.post("/mcp", json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
        authorized = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
        )

        assert unauthorized.status_code == 401
        assert authorized.status_code == 200
        assert authorized.json()["result"]["tools"][0]["name"] == "search_employees"
    finally:
        app.dependency_overrides.clear()
def test_mcp_search_employees_returns_matching_employee():
    """The search_employees MCP tool returns the stored employee matching the query."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(
        Employee(
            profile_key="staff:avsergeev",
            profile_type="staff",
            profile_id="avsergeev",
            canonical_url="https://www.hse.ru/staff/avsergeev",
            full_name="Сергеев Алексей Викторович",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"sections": []},
        )
    )
    session.commit()
    session.close()

    def override_db():
        db = Session()
        try:
            yield db
        finally:
            db.close()

    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
    # The overrides live on the shared `app` object; clear them even when an
    # assertion fails so they cannot leak into other tests.
    try:
        client = TestClient(app)

        response = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "search_employees", "arguments": {"query": "Сергеев"}},
            },
        )

        assert response.status_code == 200
        assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"]
    finally:
        app.dependency_overrides.clear()
def test_api_employees_and_stats_require_admin_session():
    """With a signed admin session cookie, /api/employees and /api/stats return data."""
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    db = Session()
    db.add(
        Employee(
            profile_key="staff:alpha",
            profile_type="staff",
            profile_id="alpha",
            canonical_url="https://www.hse.ru/staff/alpha",
            full_name="Alpha Person",
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
            current_data={"contacts": {"emails": ["alpha@hse.ru"]}, "sections": []},
        )
    )
    db.add(CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1))
    db.commit()
    db.close()
    settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")

    def override_db():
        session = Session()
        try:
            yield session
        finally:
            session.close()

    app.dependency_overrides[get_db] = override_db
    app.dependency_overrides[get_settings] = lambda: settings
    # The overrides live on the shared `app` object; clear them even when an
    # assertion fails so they cannot leak into other tests.
    try:
        client = TestClient(app)
        client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))

        employees = client.get("/api/employees", params={"q": "Alpha", "has_email": True})
        stats = client.get("/api/stats")

        assert employees.status_code == 200
        assert employees.json()["total"] == 1
        assert stats.status_code == 200
        assert stats.json()["new_in_last_run"] == 1
    finally:
        app.dependency_overrides.clear()

58
tests/test_crawler.py Normal file
View File

@@ -0,0 +1,58 @@
from datetime import datetime, timezone
from app.models import CrawlRun, Employee
from app.services.crawler import _mark_dismissed, _upsert_employee
def test_mark_dismissed_only_marks_missing_active_employees(db_session):
    """Only the active employee absent from the seen-keys set becomes dismissed."""
    still_listed = Employee(
        profile_key="staff:kept",
        canonical_url="https://www.hse.ru/staff/kept",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add(still_listed)
    no_longer_listed = Employee(
        profile_key="staff:gone",
        canonical_url="https://www.hse.ru/staff/gone",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add(no_longer_listed)
    db_session.commit()

    dismissed_total = _mark_dismissed(db_session, {"staff:kept"})

    assert dismissed_total == 1
    kept_row = db_session.query(Employee).filter_by(profile_key="staff:kept").one()
    assert kept_row.status == "active"
    gone_row = db_session.query(Employee).filter_by(profile_key="staff:gone").one()
    assert gone_row.status == "dismissed"
    assert gone_row.dismissed_at is not None
def test_upsert_employee_increments_new_count_for_new_employee(db_session):
    """Upserting a previously unknown profile bumps the run's new_count to 1."""
    crawl = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(crawl)
    db_session.commit()

    parsed_profile = {
        "source_url": "https://www.hse.ru/staff/newperson",
        "profile_type": "staff",
        "profile_id": "newperson",
        "full_name": "New Person",
        "tabs": [],
        "sections": [],
        "parser_version": "0.2.0",
        "_html": "<html></html>",
    }
    _upsert_employee(db_session, crawl, parsed_profile)
    db_session.commit()

    assert crawl.new_count == 1

28
tests/test_parser.py Normal file
View File

@@ -0,0 +1,28 @@
from bs4 import BeautifulSoup
from app.parser.profile import extract_person_tabs
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
def test_normalize_profile_url_supports_staff_and_org_persons():
    """Relative/fragment and trailing-slash URLs normalise to canonical form; identity is (type, id)."""
    staff_url = normalize_profile_url("/staff/avsergeev#sci")
    assert staff_url == "https://www.hse.ru/staff/avsergeev"

    org_person_url = normalize_profile_url("https://www.hse.ru/org/persons/123/")
    assert org_person_url == "https://www.hse.ru/org/persons/123"

    identity = parse_profile_identity("https://www.hse.ru/staff/avsergeev")
    assert identity == ("staff", "avsergeev")
def test_extract_person_tabs_prefers_person_menu_addition():
    """Tabs come from the person-menu block; the unrelated profile link outside it is ignored."""
    # The markup is kept flush-left so the string content matches the original fixture exactly.
    markup = """
<div class="person-menu is-desktop small person-menu-addition">
<a href="#main">Домашняя страница</a>
<a href="#sci" data-index="1">Публикации</a>
</div>
<a href="/org/persons/999">Other person</a>
"""
    soup = BeautifulSoup(markup, "html.parser")

    tabs = extract_person_tabs(soup, "https://www.hse.ru/staff/avsergeev")

    titles = [tab["title"] for tab in tabs]
    assert titles == ["Домашняя страница", "Публикации"]
    assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"