Compare commits

...

5 Commits

10 changed files with 305 additions and 14 deletions

View File

@@ -17,6 +17,7 @@ from app.services.admin_data import (
stats_payload, stats_payload,
) )
from app.services.crawl_control import get_running_run, run_crawl_if_idle from app.services.crawl_control import get_running_run, run_crawl_if_idle
from app.services.crawler import refresh_employee
from app.version import BACKEND_VERSION, FRONTEND_VERSION from app.version import BACKEND_VERSION, FRONTEND_VERSION
router = APIRouter(prefix="/admin") router = APIRouter(prefix="/admin")
@@ -144,10 +145,31 @@ def employee_detail(
return _render( return _render(
request, request,
"employee_detail.html", "employee_detail.html",
{"employee": employee, "employee_view": employee_detail_payload(employee), "snapshots": snapshots}, {
"employee": employee,
"employee_view": employee_detail_payload(employee),
"snapshots": snapshots,
"refresh_status": request.query_params.get("refresh_status"),
},
) )
@router.post("/employees/{employee_id}/refresh")
def refresh_employee_detail(
employee_id: int,
request: Request,
db: Session = Depends(get_db),
settings: Settings = Depends(get_settings),
):
require_admin(request, settings)
employee = db.get(Employee, employee_id)
if not employee:
return RedirectResponse("/admin/directory", status_code=303)
run = refresh_employee(db, employee, settings)
status = "success" if run.status == "completed" else "error"
return RedirectResponse(f"/admin/employees/{employee_id}?refresh_status={status}", status_code=303)
@router.get("/runs", response_class=HTMLResponse) @router.get("/runs", response_class=HTMLResponse)
def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)): def runs(request: Request, db: Session = Depends(get_db), settings: Settings = Depends(get_settings)):
require_admin(request, settings) require_admin(request, settings)

View File

@@ -263,10 +263,10 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
return publications return publications
result = data.get("result") if isinstance(data, dict) else {} result = data.get("result") if isinstance(data, dict) else {}
items = result.get("items") if isinstance(result, dict) else [] items = _extract_publication_items(result)
if not isinstance(items, list) or not items: if not items:
break break
publications.extend(_normalize_publication_item(item) for item in items if isinstance(item, dict)) publications.extend(_normalize_publication_item(item) for item in items)
total = int(result.get("total") or 0) total = int(result.get("total") or 0)
if not result.get("more") and len(publications) >= total: if not result.get("more") and len(publications) >= total:
@@ -275,6 +275,34 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
return _dedupe_publications(publications) return _dedupe_publications(publications)
def _extract_publication_items(result: object) -> list[dict]:
if not isinstance(result, dict):
return []
return _flatten_publication_items(result.get("items"))
def _flatten_publication_items(value: object) -> list[dict]:
if isinstance(value, list):
return [item for item in value if _is_publication_item(item)]
if not isinstance(value, dict):
return []
nested_items = value.get("items")
if isinstance(nested_items, list):
return [item for item in nested_items if _is_publication_item(item)]
if isinstance(nested_items, dict):
return _flatten_publication_items(nested_items)
publications = []
for child in value.values():
publications.extend(_flatten_publication_items(child))
return publications
def _is_publication_item(value: object) -> bool:
return isinstance(value, dict) and ("id" in value or "title" in value)
def _load_widget_graduation_theses( def _load_widget_graduation_theses(
session: Session, session: Session,
soup: BeautifulSoup, soup: BeautifulSoup,
@@ -359,7 +387,7 @@ def _infer_section_type(title: str, nodes: list) -> str:
lowered = title.lower() lowered = title.lower()
if _has_table(nodes): if _has_table(nodes):
return "table" return "table"
if "публикац" in lowered: if _is_publications_title(lowered):
return "publications" return "publications"
if "учебные курсы" in lowered: if "учебные курсы" in lowered:
return "courses_by_year" return "courses_by_year"
@@ -370,6 +398,10 @@ def _infer_section_type(title: str, nodes: list) -> str:
return "generic" return "generic"
def _is_publications_title(lowered_title: str) -> bool:
return lowered_title.startswith("публикац")
def _has_table(nodes: list) -> bool: def _has_table(nodes: list) -> bool:
return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes) return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes)

View File

@@ -80,6 +80,48 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
return run return run
def refresh_employee(db: Session, employee: Employee, settings: Settings) -> CrawlRun:
run = CrawlRun(source_url=employee.canonical_url, status="running", found_count=1)
db.add(run)
db.commit()
db.refresh(run)
try:
with requests.Session() as session:
parsed = parse_person_profile(
session,
employee.canonical_url,
HEADERS,
settings.request_timeout,
settings.parser_use_playwright,
)
if not parsed:
raise ValueError("Профиль не удалось распарсить.")
if _parsed_profile_key(parsed) != employee.profile_key:
raise ValueError("Распарсенный профиль не совпадает с обновляемым сотрудником.")
_upsert_employee(db, run, parsed)
run.parsed_count = 1
run.status = "completed"
except Exception as exc:
run.status = "failed"
run.error_count = 1
run.message = str(exc)
db.add(
CrawlError(
crawl_run_id=run.id,
profile_url=employee.canonical_url,
error_type=type(exc).__name__,
message=str(exc),
)
)
finally:
run.finished_at = datetime.now(timezone.utc)
db.commit()
db.refresh(run)
return run
def _ensure_source(db: Session, source_url: str) -> ParserSource: def _ensure_source(db: Session, source_url: str) -> ParserSource:
source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url)) source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url))
if source: if source:
@@ -91,10 +133,14 @@ def _ensure_source(db: Session, source_url: str) -> ParserSource:
return source return source
def _parsed_profile_key(parsed: dict) -> str:
return f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee: def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
html = parsed.pop("_html", None) html = parsed.pop("_html", None)
checksum = _checksum(parsed) checksum = _checksum(parsed)
key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}" key = _parsed_profile_key(parsed)
employee = db.scalar(select(Employee).where(Employee.profile_key == key)) employee = db.scalar(select(Employee).where(Employee.profile_key == key))
now = datetime.now(timezone.utc) now = datetime.now(timezone.utc)
if not employee: if not employee:

View File

@@ -171,6 +171,10 @@
background: transparent; background: transparent;
} }
.button--compact {
padding: 8px 12px;
}
.code { .code {
overflow-x: auto; overflow-x: auto;
padding: 14px; padding: 14px;
@@ -201,11 +205,34 @@
gap: 10px; gap: 10px;
} }
.employee-card__actions {
display: grid;
justify-items: end;
gap: 10px;
}
.employee-card__title { .employee-card__title {
margin: 0; margin: 0;
font-size: 24px; font-size: 24px;
} }
.employee-card__notice {
margin: 0;
padding: 12px 14px;
border-radius: 8px;
font-weight: 700;
}
.employee-card__notice--success {
color: #065f46;
background: #d1fae5;
}
.employee-card__notice--error {
color: #991b1b;
background: #fee2e2;
}
.employee-card__section { .employee-card__section {
padding: 20px; padding: 20px;
background: #ffffff; background: #ffffff;

View File

@@ -7,8 +7,18 @@
<h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2> <h2 class="employee-card__title">{{ employee_view.full_name or employee.profile_key }}</h2>
<span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span> <span class="badge {% if employee_view.status == "dismissed" %}badge--dismissed{% endif %}">{{ employee_view.status_display }}</span>
</div> </div>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a> <div class="employee-card__actions">
<form method="post" action="/admin/employees/{{ employee.id }}/refresh">
<button class="button button--compact" type="submit">Обновить данные</button>
</form>
<a class="admin__link" href="{{ employee_view.canonical_url }}">{{ employee_view.canonical_url }}</a>
</div>
</div> </div>
{% if refresh_status == "success" %}
<p class="employee-card__notice employee-card__notice--success">Данные сотрудника обновлены.</p>
{% elif refresh_status == "error" %}
<p class="employee-card__notice employee-card__notice--error">Не удалось обновить данные сотрудника.</p>
{% endif %}
<section class="employee-card__section"> <section class="employee-card__section">
<h3 class="employee-section__title">Основная информация</h3> <h3 class="employee-section__title">Основная информация</h3>

View File

@@ -1,3 +1,3 @@
APP_VERSION = "0.4.5" APP_VERSION = "0.4.7"
FRONTEND_VERSION = "0.4.5" FRONTEND_VERSION = "0.4.7"
BACKEND_VERSION = "0.4.5" BACKEND_VERSION = "0.4.7"

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "miem-workers" name = "miem-workers"
version = "0.4.5" version = "0.4.7"
description = "MIEM employees parser, admin API, and MCP server" description = "MIEM employees parser, admin API, and MCP server"
requires-python = ">=3.11" requires-python = ">=3.11"
dependencies = [ dependencies = [

View File

@@ -1,7 +1,8 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from types import SimpleNamespace
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from sqlalchemy import create_engine from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool from sqlalchemy.pool import StaticPool
@@ -18,7 +19,7 @@ def test_health_returns_versions():
response = client.get("/api/health") response = client.get("/api/health")
assert response.status_code == 200 assert response.status_code == 200
assert response.json()["backend_version"] == "0.4.5" assert response.json()["backend_version"] == "0.4.7"
def test_mcp_lists_tools_without_auth_and_ignores_auth_header(): def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
@@ -248,3 +249,56 @@ def test_api_employees_and_stats_require_admin_session():
assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person" assert run_details.json()["changes"]["new"][0]["full_name"] == "Alpha Person"
app.dependency_overrides.clear() app.dependency_overrides.clear()
def test_admin_refresh_employee_route_updates_only_requested_employee(monkeypatch):
engine = create_engine(
"sqlite:///:memory:",
connect_args={"check_same_thread": False},
poolclass=StaticPool,
)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
db = Session()
db.add(
Employee(
profile_key="org_person:133709486",
profile_type="org_person",
profile_id="133709486",
canonical_url="https://www.hse.ru/org/persons/133709486",
full_name="Будков Юрий Алексеевич",
status="active",
)
)
db.commit()
employee_id = db.scalar(select(Employee.id))
db.close()
settings = Settings(admin_username="admin", admin_password="password", session_secret="session-secret")
def override_db():
session = Session()
try:
yield session
finally:
session.close()
calls = []
def fake_refresh_employee(db, refreshed_employee, route_settings):
calls.append((refreshed_employee.id, route_settings))
return SimpleNamespace(status="completed")
app.dependency_overrides[get_db] = override_db
app.dependency_overrides[get_settings] = lambda: settings
monkeypatch.setattr("app.admin.refresh_employee", fake_refresh_employee)
client = TestClient(app)
client.cookies.set(SESSION_COOKIE, sign_session("admin", settings))
response = client.post(f"/admin/employees/{employee_id}/refresh", follow_redirects=False)
assert response.status_code == 303
assert response.headers["location"] == f"/admin/employees/{employee_id}?refresh_status=success"
assert calls == [(employee_id, settings)]
app.dependency_overrides.clear()

View File

@@ -27,4 +27,6 @@ def test_employee_detail_template_is_human_readable():
assert "Дата увольнения" in template assert "Дата увольнения" in template
assert "Тип профиля" in template assert "Тип профиля" in template
assert "ID профиля" in template assert "ID профиля" in template
assert "Обновить данные" in template
assert 'action="/admin/employees/{{ employee.id }}/refresh"' in template
assert "Снапшоты" in template assert "Снапшоты" in template

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs, extract_sections
from app.parser.profile_url import normalize_profile_url, parse_profile_identity from app.parser.profile_url import normalize_profile_url, parse_profile_identity
@@ -64,6 +64,47 @@ class FakeSession:
) )
class GroupedPublicationsSession(FakeSession):
def post(self, url, **kwargs):
self.posts.append((url, kwargs))
return FakeResponse(
{
"status": "ok",
"result": {
"more": False,
"total": 1,
"groupType": 2,
"items": {
"year": {
"header": {"ru": "по году", "en": "by year"},
"criteria": {"year": []},
"items": {
"2011": [
{
"id": "146366790",
"type": "ARTICLE",
"title": "Развитие теории самосогласованного поля",
"year": 2011,
"description": {"short": {"ru": "Журнал физической химии 2011."}},
}
],
"2012": [
{
"id": "146367323",
"type": "ARTICLE",
"title": "Self-consistent field theory investigation",
"year": 2012,
"description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}},
}
],
},
}
},
},
}
)
def test_normalize_profile_url_supports_staff_and_org_persons(): def test_normalize_profile_url_supports_staff_and_org_persons():
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev" assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123" assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
@@ -117,3 +158,60 @@ def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164" assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs" assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
assert session.gets[0][1]["params"] == {"supervisorId": "803294906"} assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
soup = BeautifulSoup(
"""
<script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
""",
"html.parser",
)
session = GroupedPublicationsSession()
sections = enrich_sections_from_hse_widgets(
session,
soup,
"https://www.hse.ru/org/persons/133709486",
{"User-Agent": "test"},
10,
[],
)
publications = next(section for section in sections if section["type"] == "publications")
assert publications["publications_count"] == 2
assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"]
assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790"
assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323"
def test_news_heading_with_publications_word_does_not_absorb_widget_publications():
soup = BeautifulSoup(
"""
<h2>Статья профессора МИЭМ вошла в число самых популярных публикаций на портале SpringerLink</h2>
<div class="post__text">
<p>Первоначально статья профессора вышла в российском журнале.</p>
</div>
<script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
""",
"html.parser",
)
session = FakeSession()
sections = extract_sections(soup, "https://www.hse.ru/org/persons/133709486")
sections = enrich_sections_from_hse_widgets(
session,
soup,
"https://www.hse.ru/org/persons/133709486",
{"User-Agent": "test"},
10,
sections,
)
assert sections[0]["type"] == "paragraphs"
assert sections[0]["title"].startswith("Статья профессора")
publications = [section for section in sections if section["type"] == "publications"]
assert len(publications) == 1
assert publications[0]["title"] == "Публикации и исследования"
assert publications[0]["publications_count"] == 1