From efa7192e45e113cb044de33eede1a2acec190744 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 13 May 2026 12:46:07 +0300 Subject: [PATCH] fix: support grouped HSE publication API responses --- app/parser/profile.py | 34 ++++++++++++++++++++-- app/version.py | 6 ++-- pyproject.toml | 2 +- tests/test_api_mcp.py | 2 +- tests/test_parser.py | 67 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 103 insertions(+), 8 deletions(-) diff --git a/app/parser/profile.py b/app/parser/profile.py index 5e6265e..84c62fa 100644 --- a/app/parser/profile.py +++ b/app/parser/profile.py @@ -263,10 +263,10 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di return publications result = data.get("result") if isinstance(data, dict) else {} - items = result.get("items") if isinstance(result, dict) else [] - if not isinstance(items, list) or not items: + items = _extract_publication_items(result) + if not items: break - publications.extend(_normalize_publication_item(item) for item in items if isinstance(item, dict)) + publications.extend(_normalize_publication_item(item) for item in items) total = int(result.get("total") or 0) if not result.get("more") and len(publications) >= total: @@ -275,6 +275,34 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di return _dedupe_publications(publications) +def _extract_publication_items(result: object) -> list[dict]: + if not isinstance(result, dict): + return [] + return _flatten_publication_items(result.get("items")) + + +def _flatten_publication_items(value: object) -> list[dict]: + if isinstance(value, list): + return [item for item in value if _is_publication_item(item)] + if not isinstance(value, dict): + return [] + + nested_items = value.get("items") + if isinstance(nested_items, list): + return [item for item in nested_items if _is_publication_item(item)] + if isinstance(nested_items, dict): + return _flatten_publication_items(nested_items) + + publications = [] + for child in value.values(): + publications.extend(_flatten_publication_items(child)) + return publications + + +def _is_publication_item(value: object) -> bool: + return isinstance(value, dict) and ("id" in value or "title" in value) + + def _load_widget_graduation_theses( session: Session, soup: BeautifulSoup, diff --git a/app/version.py b/app/version.py index 3043cd4..0ecc9df 100644 --- a/app/version.py +++ b/app/version.py @@ -1,3 +1,3 @@ -APP_VERSION = "0.4.5" -FRONTEND_VERSION = "0.4.5" -BACKEND_VERSION = "0.4.5" +APP_VERSION = "0.4.6" +FRONTEND_VERSION = "0.4.6" +BACKEND_VERSION = "0.4.6" diff --git a/pyproject.toml b/pyproject.toml index 9a1e57a..0f81068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "miem-workers" -version = "0.4.5" +version = "0.4.6" description = "MIEM employees parser, admin API, and MCP server" requires-python = ">=3.11" dependencies = [ diff --git a/tests/test_api_mcp.py b/tests/test_api_mcp.py index 5bd91ea..90fbc6d 100644 --- a/tests/test_api_mcp.py +++ b/tests/test_api_mcp.py @@ -18,7 +18,7 @@ def test_health_returns_versions(): response = client.get("/api/health") assert response.status_code == 200 - assert response.json()["backend_version"] == "0.4.5" + assert response.json()["backend_version"] == "0.4.6" def test_mcp_lists_tools_without_auth_and_ignores_auth_header(): diff --git a/tests/test_parser.py b/tests/test_parser.py index f1e933a..2347e12 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -64,6 +64,47 @@ class FakeSession: ) +class GroupedPublicationsSession(FakeSession): + def post(self, url, **kwargs): + self.posts.append((url, kwargs)) + return FakeResponse( + { + "status": "ok", + "result": { + "more": False, + "total": 1, + "groupType": 2, + "items": { + "year": { + "header": {"ru": "по году", "en": "by year"}, + "criteria": {"year": []}, + "items": { + "2011": [ + { + "id": "146366790", + "type": "ARTICLE", + "title": "Развитие теории самосогласованного поля", + "year": 2011, + "description": {"short": {"ru": "Журнал физической химии 2011."}}, + } + ], + "2012": [ + { + "id": "146367323", + "type": "ARTICLE", + "title": "Self-consistent field theory investigation", + "year": 2012, + "description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}}, + } + ], + }, + } + }, + }, + } + ) + + def test_normalize_profile_url_supports_staff_and_org_persons(): assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev" assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123" @@ -117,3 +158,29 @@ def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr(): assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164" assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs" assert session.gets[0][1]["params"] == {"supervisorId": "803294906"} + + +def test_enrich_sections_from_hse_widgets_loads_grouped_publications(): + soup = BeautifulSoup( + """ + + """, + "html.parser", + ) + session = GroupedPublicationsSession() + + sections = enrich_sections_from_hse_widgets( + session, + soup, + "https://www.hse.ru/org/persons/133709486", + {"User-Agent": "test"}, + 10, + [], + ) + + publications = next(section for section in sections if section["type"] == "publications") + + assert publications["publications_count"] == 2 + assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"] + assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790" + assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323" -- 2.49.1