diff --git a/app/admin.py b/app/admin.py index d0c8465..0d7c734 100644 --- a/app/admin.py +++ b/app/admin.py @@ -17,6 +17,7 @@ from app.services.admin_data import ( stats_payload, ) from app.services.crawl_control import get_running_run, run_crawl_if_idle +from app.services.crawler import refresh_employee from app.version import BACKEND_VERSION, FRONTEND_VERSION router = APIRouter(prefix="/admin") @@ -144,10 +145,31 @@ def employee_detail( return _render( request, "employee_detail.html", - {"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots}, + { + "employee": employee, + "employee_view": employee_detail_payload(employee), + "snapshots": snapshots, + "refresh_status": request.query_params.get("refresh_status"), + }, ) +@router.post("/employees/{employee_id}/refresh") +def refresh_employee_detail( + employee_id: int, + request: Request, + db: Session = Depends(get_db), + settings: Settings = Depends(get_settings), +): + require_admin(request, settings) + employee = db.get(Employee, employee_id) + if not employee: + return RedirectResponse("/admin/directory", status_code=303) + run = refresh_employee(db, employee, settings) + status = "success" if run.status == "completed" else "error" + return RedirectResponse(f"/admin/employees/{employee_id}?refresh_status={status}", status_code=303) + + @router.get("/runs", response_class=HTMLResponse) def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)): require_admin(request, settings) diff --git a/app/parser/profile.py b/app/parser/profile.py index 84c62fa..8365a96 100644 --- a/app/parser/profile.py +++ b/app/parser/profile.py @@ -387,7 +387,7 @@ def _infer_section_type(title: str, nodes: list) -> str: lowered = title.lower() if _has_table(nodes): return "table" - if "публикац" in lowered: + if _is_publications_title(lowered): return "publications" if "учебные курсы" in lowered: return "courses_by_year" @@ -398,6 +398,10 @@ def 
_infer_section_type(title: str, nodes: list) -> str:
     return "generic"
 
 
+def _is_publications_title(lowered_title: str) -> bool:
+    return lowered_title.startswith("публикац")
+
+
 def _has_table(nodes: list) -> bool:
     return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes)
 
diff --git a/app/services/crawler.py b/app/services/crawler.py
index 840c5e7..5048ab7 100644
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -80,6 +80,60 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
     return run
 
 
+def refresh_employee(db: Session, employee: Employee, settings: Settings) -> CrawlRun:
+    """Re-parse one employee profile and record the attempt as a ``CrawlRun``.
+
+    The run row is committed with status ``running`` before any network work.
+    Errors raised while fetching, validating or upserting the profile are
+    caught: the run is marked ``failed`` and a ``CrawlError`` row is stored,
+    so callers check ``run.status`` rather than catching exceptions.
+    """
+    run = CrawlRun(source_url=employee.canonical_url, status="running", found_count=1)
+    db.add(run)
+    db.commit()
+    db.refresh(run)
+
+    try:
+        with requests.Session() as session:
+            parsed = parse_person_profile(
+                session,
+                employee.canonical_url,
+                HEADERS,
+                settings.request_timeout,
+                settings.parser_use_playwright,
+            )
+        if not parsed:
+            raise ValueError("Профиль не удалось распарсить.")
+        if _parsed_profile_key(parsed) != employee.profile_key:
+            raise ValueError("Распарсенный профиль не совпадает с обновляемым сотрудником.")
+
+        _upsert_employee(db, run, parsed)
+        run.parsed_count = 1
+        run.status = "completed"
+    except Exception as exc:
+        # Drop whatever the failed attempt left in the session. After a flush
+        # error the commit in ``finally`` would otherwise raise and leave the
+        # run stuck in "running"; after any other error it would persist
+        # half-applied employee changes alongside a failed run.
+        db.rollback()
+        run.status = "failed"
+        run.error_count = 1
+        run.message = str(exc)
+        db.add(
+            CrawlError(
+                crawl_run_id=run.id,
+                profile_url=employee.canonical_url,
+                error_type=type(exc).__name__,
+                message=str(exc),
+            )
+        )
+    finally:
+        run.finished_at = datetime.now(timezone.utc)
+        db.commit()
+        db.refresh(run)
+    return run
+
+
 def _ensure_source(db: Session, source_url: str) -> ParserSource:
     source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url))
     if source:
@@ -91,10 +145,14 @@ def _ensure_source(db: Session, source_url: str) -> ParserSource:
     return source
 
 
+def _parsed_profile_key(parsed: dict) -> str:
+    return f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
+ + def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: html = parsed.pop("_html", None) checksum = _checksum(parsed) - key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}" + key = _parsed_profile_key(parsed) employee = db.scalar(select(Employee).where(Employee.profile_key == key)) now = datetime.now(timezone.utc) if not employee: diff --git a/app/static/admin.css b/app/static/admin.css index e213ba5..0a0f4b3 100644 --- a/app/static/admin.css +++ b/app/static/admin.css @@ -171,6 +171,10 @@ background: transparent; } +.button--compact { + padding: 8px 12px; +} + .code { overflow-x: auto; padding: 14px; @@ -201,11 +205,34 @@ gap: 10px; } +.employee-card__actions { + display: grid; + justify-items: end; + gap: 10px; +} + .employee-card__title { margin: 0; font-size: 24px; } +.employee-card__notice { + margin: 0; + padding: 12px 14px; + border-radius: 8px; + font-weight: 700; +} + +.employee-card__notice--success { + color: #065f46; + background: #d1fae5; +} + +.employee-card__notice--error { + color: #991b1b; + background: #fee2e2; +} + .employee-card__section { padding: 20px; background: #ffffff; diff --git a/app/templates/employee_detail.html b/app/templates/employee_detail.html index d544d2f..9809566 100644 --- a/app/templates/employee_detail.html +++ b/app/templates/employee_detail.html @@ -7,8 +7,18 @@

{{ employee_view.full_name or employee.profile_key }}

{{ employee_view.status_display }} - {{ employee_view.canonical_url }} +
+
+ +
+ {{ employee_view.canonical_url }} +
+ {% if refresh_status == "success" %} +

Данные сотрудника обновлены.

+ {% elif refresh_status == "error" %} +

Не удалось обновить данные сотрудника.

+ {% endif %}

Основная информация

diff --git a/app/version.py b/app/version.py index 0ecc9df..0c07749 100644 --- a/app/version.py +++ b/app/version.py @@ -1,3 +1,3 @@ -APP_VERSION = "0.4.6" -FRONTEND_VERSION = "0.4.6" -BACKEND_VERSION = "0.4.6" +APP_VERSION = "0.4.7" +FRONTEND_VERSION = "0.4.7" +BACKEND_VERSION = "0.4.7" diff --git a/pyproject.toml b/pyproject.toml index 0f81068..19e849e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "miem-workers" -version = "0.4.6" +version = "0.4.7" description = "MIEM employees parser, admin API, and MCP server" requires-python = ">=3.11" dependencies = [ diff --git a/tests/test_api_mcp.py b/tests/test_api_mcp.py index 90fbc6d..4146134 100644 --- a/tests/test_api_mcp.py +++ b/tests/test_api_mcp.py @@ -1,7 +1,8 @@ from datetime import datetime, timezone +from types import SimpleNamespace from fastapi.testclient import TestClient -from sqlalchemy import create_engine +from sqlalchemy import create_engine, select from sqlalchemy.orm import sessionmaker from sqlalchemy.pool import StaticPool @@ -18,7 +19,7 @@ def test_health_returns_versions(): response = client.get("/api/health") assert response.status_code == 200 - assert response.json()["backend_version"] == "0.4.6" + assert response.json()["backend_version"] == "0.4.7" def test_mcp_lists_tools_without_auth_and_ignores_auth_header(): @@ -248,3 +249,56 @@ def test_api_employees_and_stats_require_admin_session(): assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person" app.dependency_overrides.clear() + + +def test_admin_refresh_employee_route_updates_only_requested_employee(monkeypatch): + engine = create_engine( + "sqlite:///:memory:", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + Base.metadata.create_all(engine) + Session = sessionmaker(bind=engine) + db = Session() + db.add( + Employee( + profile_key="org_person:133709486", + profile_type="org_person", + profile_id="133709486", + 
canonical_url="https://www.hse.ru/org/persons/133709486", + full_name="Будков Юрий Алексеевич", + status="active", + ) + ) + db.commit() + employee_id = db.scalar(select(Employee.id)) + db.close() + + settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret") + + def override_db(): + session = Session() + try: + yield session + finally: + session.close() + + calls = [] + + def fake_refresh_employee(db, refreshed_employee, route_settings): + calls.append((refreshed_employee.id, route_settings)) + return SimpleNamespace(status="completed") + + app.dependency_overrides[get_db] = override_db + app.dependency_overrides[get_settings] = lambda: settings + monkeypatch.setattr("app.admin.refresh_employee", fake_refresh_employee) + client = TestClient(app) + client.cookies.set(SESSION_COOKIE, sign_session("admin", settings)) + + response = client.post(f"/admin/employees/{employee_id}/refresh", follow_redirects=False) + + assert response.status_code == 303 + assert response.headers["location"] == f"/admin/employees/{employee_id}?refresh_status=success" + assert calls == [(employee_id, settings)] + + app.dependency_overrides.clear() diff --git a/tests/test_employee_detail_template.py b/tests/test_employee_detail_template.py index 953ce5d..840fd32 100644 --- a/tests/test_employee_detail_template.py +++ b/tests/test_employee_detail_template.py @@ -27,4 +27,6 @@ def test_employee_detail_template_is_human_readable(): assert "Дата увольнения" in template assert "Тип профиля" in template assert "ID профиля" in template + assert "Обновить данные" in template + assert 'action="/admin/employees/{{ employee.id }}/refresh"' in template assert "Снапшоты" in template diff --git a/tests/test_parser.py b/tests/test_parser.py index 2347e12..7adab9c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup -from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs +from 
app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs, extract_sections from app.parser.profile_url import normalize_profile_url, parse_profile_identity @@ -184,3 +184,34 @@ def test_enrich_sections_from_hse_widgets_loads_grouped_publications(): assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"] assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790" assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323" + + +def test_news_heading_with_publications_word_does_not_absorb_widget_publications(): + soup = BeautifulSoup( + """ +

Статья профессора МИЭМ вошла в число самых популярных публикаций на портале SpringerLink

+
+

Первоначально статья профессора вышла в российском журнале.

+
+ + """, + "html.parser", + ) + session = FakeSession() + + sections = extract_sections(soup, "https://www.hse.ru/org/persons/133709486") + sections = enrich_sections_from_hse_widgets( + session, + soup, + "https://www.hse.ru/org/persons/133709486", + {"User-Agent": "test"}, + 10, + sections, + ) + + assert sections[0]["type"] == "paragraphs" + assert sections[0]["title"].startswith("Статья профессора") + publications = [section for section in sections if section["type"] == "publications"] + assert len(publications) == 1 + assert publications[0]["title"] == "Публикации и исследования" + assert publications[0]["publications_count"] == 1