Merge pull request 'fix: support grouped HSE publication API responses' (#21) from fix/grouped-publications-parser into main
Reviewed-on: #21
This commit was merged in pull request #21.
This commit is contained in:
@@ -263,10 +263,10 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
|
|||||||
return publications
|
return publications
|
||||||
|
|
||||||
result = data.get("result") if isinstance(data, dict) else {}
|
result = data.get("result") if isinstance(data, dict) else {}
|
||||||
items = result.get("items") if isinstance(result, dict) else []
|
items = _extract_publication_items(result)
|
||||||
if not isinstance(items, list) or not items:
|
if not items:
|
||||||
break
|
break
|
||||||
publications.extend(_normalize_publication_item(item) for item in items if isinstance(item, dict))
|
publications.extend(_normalize_publication_item(item) for item in items)
|
||||||
|
|
||||||
total = int(result.get("total") or 0)
|
total = int(result.get("total") or 0)
|
||||||
if not result.get("more") and len(publications) >= total:
|
if not result.get("more") and len(publications) >= total:
|
||||||
@@ -275,6 +275,34 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
|
|||||||
return _dedupe_publications(publications)
|
return _dedupe_publications(publications)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_publication_items(result: object) -> list[dict]:
    """Return the publication dicts found under ``result["items"]``.

    Anything that is not a dict yields an empty list; otherwise the ``items``
    value is handed to ``_flatten_publication_items`` for flattening.
    """
    return (
        _flatten_publication_items(result.get("items"))
        if isinstance(result, dict)
        else []
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_publication_items(value: object) -> list[dict]:
|
||||||
|
if isinstance(value, list):
|
||||||
|
return [item for item in value if _is_publication_item(item)]
|
||||||
|
if not isinstance(value, dict):
|
||||||
|
return []
|
||||||
|
|
||||||
|
nested_items = value.get("items")
|
||||||
|
if isinstance(nested_items, list):
|
||||||
|
return [item for item in nested_items if _is_publication_item(item)]
|
||||||
|
if isinstance(nested_items, dict):
|
||||||
|
return _flatten_publication_items(nested_items)
|
||||||
|
|
||||||
|
publications = []
|
||||||
|
for child in value.values():
|
||||||
|
publications.extend(_flatten_publication_items(child))
|
||||||
|
return publications
|
||||||
|
|
||||||
|
|
||||||
|
def _is_publication_item(value: object) -> bool:
|
||||||
|
return isinstance(value, dict) and ("id" in value or "title" in value)
|
||||||
|
|
||||||
|
|
||||||
def _load_widget_graduation_theses(
|
def _load_widget_graduation_theses(
|
||||||
session: Session,
|
session: Session,
|
||||||
soup: BeautifulSoup,
|
soup: BeautifulSoup,
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
APP_VERSION = "0.4.5"
|
APP_VERSION = "0.4.6"
|
||||||
FRONTEND_VERSION = "0.4.5"
|
FRONTEND_VERSION = "0.4.6"
|
||||||
BACKEND_VERSION = "0.4.5"
|
BACKEND_VERSION = "0.4.6"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "miem-workers"
|
name = "miem-workers"
|
||||||
version = "0.4.5"
|
version = "0.4.6"
|
||||||
description = "MIEM employees parser, admin API, and MCP server"
|
description = "MIEM employees parser, admin API, and MCP server"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ def test_health_returns_versions():
|
|||||||
response = client.get("/api/health")
|
response = client.get("/api/health")
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json()["backend_version"] == "0.4.5"
|
assert response.json()["backend_version"] == "0.4.6"
|
||||||
|
|
||||||
|
|
||||||
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
|
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
|
||||||
|
|||||||
@@ -64,6 +64,47 @@ class FakeSession:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class GroupedPublicationsSession(FakeSession):
    """Fake session whose ``post`` answers with a year-grouped publications payload."""

    def post(self, url, **kwargs):
        """Record the call in ``self.posts`` and return the canned grouped response."""
        self.posts.append((url, kwargs))

        # Publications keyed by year, each year holding a list of items.
        items_by_year = {
            "2011": [
                {
                    "id": "146366790",
                    "type": "ARTICLE",
                    "title": "Развитие теории самосогласованного поля",
                    "year": 2011,
                    "description": {"short": {"ru": "Журнал физической химии 2011."}},
                }
            ],
            "2012": [
                {
                    "id": "146367323",
                    "type": "ARTICLE",
                    "title": "Self-consistent field theory investigation",
                    "year": 2012,
                    "description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}},
                }
            ],
        }
        year_group = {
            "header": {"ru": "по году", "en": "by year"},
            "criteria": {"year": []},
            "items": items_by_year,
        }
        payload = {
            "status": "ok",
            "result": {
                "more": False,
                "total": 1,
                "groupType": 2,
                "items": {"year": year_group},
            },
        }
        return FakeResponse(payload)
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_profile_url_supports_staff_and_org_persons():
|
def test_normalize_profile_url_supports_staff_and_org_persons():
|
||||||
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
|
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
|
||||||
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
|
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
|
||||||
@@ -117,3 +158,29 @@ def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
|
|||||||
assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
|
assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
|
||||||
assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
|
assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
|
||||||
assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
|
assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
    """Year-grouped publication responses end up as a flat publications section."""
    widget_html = """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
        """
    soup = BeautifulSoup(widget_html, "html.parser")
    session = GroupedPublicationsSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/133709486",
        {"User-Agent": "test"},
        10,
        [],
    )

    publications_section = next(
        candidate for candidate in sections if candidate["type"] == "publications"
    )
    entries = publications_section["publications"]

    assert publications_section["publications_count"] == 2
    assert [entry["id"] for entry in entries] == ["146366790", "146367323"]
    assert entries[0]["url"] == "https://publications.hse.ru/view/146366790"
    assert entries[1]["url"] == "https://publications.hse.ru/view/146367323"
|
||||||
|
|||||||
Reference in New Issue
Block a user