From cc9481fc6c4dce92a226db2435e791494418a868 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 29 Apr 2026 14:15:29 +0300 Subject: [PATCH] fix: enrich HSE profile parsing with publications and theses --- .gitignore | 1 + README.md | 2 +- app/admin.py | 2 +- app/parser/profile.py | 219 +++++++++++++++++++++++++ app/services/admin_data.py | 33 +++- app/static/admin.css | 12 ++ app/templates/directory.html | 6 + app/templates/employee_detail.html | 30 +++- app/version.py | 6 +- pyproject.toml | 2 +- tests/test_admin_data.py | 17 +- tests/test_admin_templates.py | 2 + tests/test_api_mcp.py | 2 +- tests/test_employee_detail_template.py | 2 + tests/test_parser.py | 93 ++++++++++- 15 files changed, 418 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 42a31c9..da20559 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ *.py[cod] *.db .pytest_cache/ +pytest-cache-files-*/ .coverage htmlcov/ postgres_data/ diff --git a/README.md b/README.md index b048ffa..c672721 100644 --- a/README.md +++ b/README.md @@ -110,4 +110,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql docker compose down ``` -Версия сервиса: `0.2.7`. Админка всегда показывает версии backend и frontend в footer. +Версия сервиса: `0.2.8`. Админка всегда показывает версии backend и frontend в footer. diff --git a/app/admin.py b/app/admin.py index 468749a..5fba191 100644 --- a/app/admin.py +++ b/app/admin.py @@ -108,7 +108,7 @@ def directory( "has_email": has_email or "", "sort": sort, "direction": direction, - "limit": limit, + "limit": page["limit"], "offset": offset, }, }, diff --git a/app/parser/profile.py b/app/parser/profile.py index 52d3296..5e6265e 100644 --- a/app/parser/profile.py +++ b/app/parser/profile.py @@ -164,6 +164,7 @@ def parse_person_profile( header = extract_person_header(soup, normalized_url) tabs = extract_person_tabs(soup, normalized_url) sections = extract_sections(soup, normalized_url) + sections = enrich_sections_from_hse_widgets(session, soup, normalized_url, headers, timeout, sections) internal_links = [tab["href"] for tab in tabs if tab.get("href")] return { @@ -183,6 +184,25 @@ def parse_person_profile( } +def enrich_sections_from_hse_widgets( + session: Session, + soup: BeautifulSoup, + source_url: str, + headers: dict[str, str], + timeout: int, + sections: list[dict], +) -> list[dict]: + enriched = list(sections) + publications = _load_widget_publications(session, soup, headers, timeout) + if publications: + enriched = _upsert_publications_section(enriched, publications) + + theses = _load_widget_graduation_theses(session, soup, source_url, headers, timeout) + if theses: + enriched = _upsert_graduation_theses_section(enriched, theses) + return enriched + + def _render_with_playwright(source_url: str, fallback_html: str) -> str: try: from playwright.sync_api import sync_playwright @@ -206,6 +226,89 @@ def _render_with_playwright(source_url: str, fallback_html: str) -> str: return fallback_html +def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: dict[str, str], timeout: int) -> list[dict]: + script = soup.select_one('script[data-widget-name="AuthorSearch"][data-author]') + if not script: + return [] + author_id = normalize_ws(script.get("data-author")) + if not author_id: + return [] + + publications = [] + page_id = 1 + per_page = 100 + while page_id <= 20: + payload = { + "type": "ANY", + "filterParams": ( + f'"acceptLanguage":"ru"|"fullTextPublicEnabled": 1|' + f'"pubsAuthor": {author_id}|"widgetName": "AuthorSearch"' + ), + "paginationParams": { + "publsSort": ["TITLE_ASC"], + "publsCount": per_page, + "pageId": page_id, + }, + } + try: + response = session.post( + "https://publications.hse.ru/api/searchPubs", + json=payload, + headers=headers, + timeout=timeout, + ) + response.raise_for_status() + data = response.json() + except Exception: + return publications + + result = data.get("result") if isinstance(data, dict) else {} + items = result.get("items") if isinstance(result, dict) else [] + if not isinstance(items, list) or not items: + break + publications.extend(_normalize_publication_item(item) for item in items if isinstance(item, dict)) + + total = int(result.get("total") or 0) + if not result.get("more") and len(publications) >= total: + break + page_id += 1 + return _dedupe_publications(publications) + + +def _load_widget_graduation_theses( + session: Session, + soup: BeautifulSoup, + source_url: str, + headers: dict[str, str], + timeout: int, +) -> list[dict]: + script = soup.select_one('script[src*="/n/stat/vkr/app.js"][data-person-id]') + if not script: + return [] + person_id = normalize_ws(script.get("data-person-id")) + api_url = normalize_ws(script.get("data-api-url")) or "/n/vkr/api/" + if not person_id: + return [] + + request_headers = {**headers, "x-portal-language": "ru"} + try: + response = session.get( + urljoin(source_url, api_url), + params={"supervisorId": person_id}, + headers=request_headers, + timeout=timeout, + ) + response.raise_for_status() + data = response.json() + except Exception: + return [] + + items = data.get("data") if isinstance(data, dict) else [] + if not isinstance(items, list): + return [] + return [_normalize_vkr_item(item, source_url) for item in items if isinstance(item, dict)] + + def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]: nodes = [] for sibling in start_h2.next_siblings: @@ -353,6 +456,122 @@ def _parse_vkr_items(nodes: list) -> list[str]: return [item for item in dict.fromkeys(items) if item] +def _normalize_publication_item(item: dict) -> dict: + publication_id = str(item.get("id") or "").strip() + title = _html_to_text(item.get("title")) + year = item.get("year") + publication_type = str(item.get("type") or "").strip() or None + description = item.get("description") if isinstance(item.get("description"), dict) else {} + short_description = _localized_value(description.get("short")) or _localized_value(description.get("shortLeft")) + text = normalize_ws(" ".join(part for part in [title, str(year or ""), short_description] if part)) + return { + "id": publication_id or None, + "title": title or publication_id, + "year": year, + "type": publication_type, + "url": f"https://publications.hse.ru/view/{publication_id}" if publication_id else None, + "text": text or title or publication_id, + } + + +def _normalize_vkr_item(item: dict, source_url: str) -> dict: + thesis_id = item.get("id") + program = item.get("learnProgram") if isinstance(item.get("learnProgram"), dict) else {} + org_unit = item.get("orgUnit") if isinstance(item.get("orgUnit"), dict) else {} + supervisors = [] + for supervisor in item.get("supervisors") or []: + if not isinstance(supervisor, dict): + continue + name = normalize_ws(supervisor.get("name")) + url = normalize_ws(supervisor.get("url")) + if name or url: + supervisors.append({"name": name or url, "url": url or None}) + return { + "id": thesis_id, + "student": normalize_ws(item.get("student")), + "title": normalize_ws(item.get("title")), + "defense_year": item.get("year"), + "level": normalize_ws(item.get("level")), + "rating": item.get("rating"), + "project_url": urljoin(source_url, f"/edu/vkr/{thesis_id}") if thesis_id else None, + "program": normalize_ws(program.get("title")), + "program_url": urljoin(source_url, program.get("url")) if program.get("url") else None, + "org_unit": normalize_ws(org_unit.get("title")), + "org_unit_url": urljoin(source_url, org_unit.get("url")) if org_unit.get("url") else None, + "supervisors": supervisors, + "text": normalize_ws(" ".join(str(part) for part in [item.get("student"), item.get("title"), item.get("year")] if part)), + } + + +def _upsert_publications_section(sections: list[dict], publications: list[dict]) -> list[dict]: + merged = [] + inserted = False + for section in sections: + if section.get("type") != "publications": + merged.append(section) + continue + existing = section.get("publications") or [] + section = { + **section, + "publications_count": max(section.get("publications_count") or 0, len(publications)), + "publications": _dedupe_publications([*existing, *publications]), + } + section["items"] = [item["text"] for item in section["publications"] if item.get("text")] + merged.append(section) + inserted = True + if not inserted: + merged.append( + { + "title": "Публикации и исследования", + "slug": "publikacii_i_issledovaniya", + "type": "publications", + "raw_text": "", + "paragraphs": [], + "items": [item["text"] for item in publications if item.get("text")], + "links": [], + "publications_count": len(publications), + "publications": publications, + } + ) + return merged + + +def _upsert_graduation_theses_section(sections: list[dict], theses: list[dict]) -> list[dict]: + section = { + "title": "Выпускные квалификационные работы студентов НИУ ВШЭ", + "slug": "vypusknye_kvalifikacionnye_raboty_studentov_niu_vshe", + "type": "graduation_theses", + "raw_text": "", + "paragraphs": [], + "items": [item["text"] for item in theses if item.get("text")], + "links": [{"text": item["title"], "url": item["project_url"]} for item in theses if item.get("title") and item.get("project_url")], + "theses_count": len(theses), + "theses": theses, + } + return [item for item in sections if item.get("type") != "graduation_theses"] + [section] + + +def _dedupe_publications(items: list[dict]) -> list[dict]: + seen = set() + unique = [] + for item in items: + key = item.get("id") or item.get("url") or item.get("title") + if key and key not in seen: + seen.add(key) + unique.append(item) + return unique + + +def _html_to_text(value: object) -> str: + return normalize_ws(BeautifulSoup(str(value or ""), "html.parser").get_text(" ", strip=True)) + + +def _localized_value(value: object) -> str: + if isinstance(value, dict): + return normalize_ws(value.get("ru") or value.get("publ") or value.get("en")) + return normalize_ws(str(value or "")) + + def _slugify(value: str) -> str: cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE) return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section" diff --git a/app/services/admin_data.py b/app/services/admin_data.py index 18b531d..09fc9de 100644 --- a/app/services/admin_data.py +++ b/app/services/admin_data.py @@ -112,7 +112,7 @@ def list_employees_page( limit: int = 50, offset: int = 0, ) -> dict[str, Any]: - limit = max(1, min(limit, 200)) + limit = limit if limit in {25, 50, 100} else 50 offset = max(0, offset) base_stmt = build_employee_query( status=status, @@ -281,6 +281,8 @@ def _normalize_section(section: Any) -> dict[str, Any]: "year_entries": _normalize_year_entries(section.get("year_entries")), "publications": _normalize_publications(section.get("publications")), "publications_count": section.get("publications_count"), + "theses": _normalize_theses(section.get("theses")), + "theses_count": section.get("theses_count"), "academic_year": section.get("academic_year"), "courses": _normalize_courses(section.get("courses")), "table": _normalize_table(section.get("table")), @@ -349,6 +351,35 @@ def _normalize_courses(items: Any) -> list[dict[str, str | None]]: return normalized +def _normalize_theses(items: Any) -> list[dict[str, Any]]: + normalized = [] + if not isinstance(items, list): + return normalized + for item in items: + if not isinstance(item, dict): + continue + title = str(item.get("title") or "").strip() + student = str(item.get("student") or "").strip() + if not title and not student: + continue + normalized.append( + { + "id": item.get("id"), + "student": student, + "title": title, + "defense_year": item.get("defense_year") or item.get("year"), + "level": str(item.get("level") or "").strip(), + "rating": item.get("rating"), + "project_url": str(item.get("project_url") or "").strip() or None, + "program": str(item.get("program") or "").strip(), + "program_url": str(item.get("program_url") or "").strip() or None, + "org_unit": str(item.get("org_unit") or "").strip(), + "org_unit_url": str(item.get("org_unit_url") or "").strip() or None, + } + ) + return normalized + + def _normalize_table(table: Any) -> dict[str, Any] | None: if not isinstance(table, dict): return None diff --git a/app/static/admin.css b/app/static/admin.css index 6a668c5..6360606 100644 --- a/app/static/admin.css +++ b/app/static/admin.css @@ -270,6 +270,18 @@ line-height: 1.55; } +.employee-section__meta { + display: flex; + flex-wrap: wrap; + gap: 8px 12px; + color: #4b5563; + font-size: 13px; +} + +.employee-section__meta-item { + line-height: 1.4; +} + .employee-section__table-wrap { overflow-x: auto; } diff --git a/app/templates/directory.html b/app/templates/directory.html index 8570a12..c0cd64e 100644 --- a/app/templates/directory.html +++ b/app/templates/directory.html @@ -33,6 +33,12 @@ + + diff --git a/app/templates/employee_detail.html b/app/templates/employee_detail.html index 6d647a5..d544d2f 100644 --- a/app/templates/employee_detail.html +++ b/app/templates/employee_detail.html @@ -138,6 +138,34 @@ {% endfor %} + {% elif section.type == "graduation_theses" and section.theses %} + {% if section.theses_count %}

Всего: {{ section.theses_count }}

{% endif %} + {% elif section.type == "table" and section.table %}
@@ -170,7 +198,7 @@ {% endif %} {% endif %} - {% if section.links and section.type not in ["courses_by_year"] %} + {% if section.links and section.type not in ["courses_by_year", "graduation_theses"] %}