fix: enrich HSE profile parsing with publications and theses

This commit is contained in:
Anton
2026-04-29 14:15:29 +03:00
parent cf578ce699
commit cc9481fc6c
15 changed files with 418 additions and 11 deletions

View File

@@ -85,6 +85,19 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
"academic_year": "2025/2026",
"courses": [{"title": "Course", "url": "https://example.test/course"}],
},
{
"title": "ВКР",
"type": "graduation_theses",
"theses_count": 1,
"theses": [
{
"student": "Student Name",
"title": "Thesis title",
"defense_year": 2025,
"project_url": "https://www.hse.ru/edu/vkr/1",
}
],
},
{
"title": "Fallback",
"type": "generic",
@@ -102,7 +115,8 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
assert payload["sections"][0]["year_entries"][0]["text"] == "Master degree"
assert payload["sections"][1]["publications"][0]["title"] == "Paper"
assert payload["sections"][2]["courses"][0]["title"] == "Course"
assert payload["sections"][3]["paragraphs"] == ["Fallback text"]
assert payload["sections"][3]["theses"][0]["student"] == "Student Name"
assert payload["sections"][4]["paragraphs"] == ["Fallback text"]
def test_employee_payloads_tolerate_malformed_current_data(db_session):
@@ -155,6 +169,7 @@ def test_list_employees_page_filters_sorts_and_paginates(db_session):
assert page["total"] == 1
assert page["employees"][0]["full_name"] == "Alpha"
assert page["limit"] == 50
def test_stats_payload_uses_latest_run_new_count(db_session):

View File

@@ -18,6 +18,8 @@ def test_directory_template_is_russian_and_uses_display_dates():
assert "Сотрудники" in template
assert "Колонки" in template
assert "Применить" in template
assert "На странице: {{ value }}" in template
assert "{% for value in [25, 50, 100] %}" in template
assert "Найдено:" in template
assert "employee.first_seen_display" in template
assert "employee.last_seen_display" in template

View File

@@ -18,7 +18,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.2.7"
assert response.json()["backend_version"] == "0.2.8"
def test_mcp_requires_token_and_lists_tools():

View File

@@ -14,6 +14,8 @@ def test_employee_detail_template_is_human_readable():
assert "Основная информация" in template
assert "Контакты" in template
assert "Разделы профиля" in template
assert "graduation_theses" in template
assert "Год защиты" in template
assert "Parser version" not in template
assert "First seen" not in template
assert "Last seen" not in template

View File

@@ -1,9 +1,69 @@
from bs4 import BeautifulSoup
from app.parser.profile import extract_person_tabs
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
class FakeResponse:
    """Minimal stand-in for an HTTP response that serves a canned JSON payload."""

    def __init__(self, payload):
        # Kept as-is; json() hands back this same object unchanged.
        self.payload = payload

    def raise_for_status(self):
        """Do nothing: this fake never represents an HTTP error."""
        return None

    def json(self):
        """Return the payload supplied at construction time."""
        return self.payload
class FakeSession:
    """Test double for an HTTP session that records calls and returns canned data.

    Every ``post``/``get`` call is appended to ``posts``/``gets`` as a
    ``(url, kwargs)`` tuple so a test can assert on the requests that were made.
    Both methods ignore their arguments when building the response.
    """

    def __init__(self):
        # Separate per-instance call logs: one for POST, one for GET.
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Record the call and return a canned result holding one ARTICLE item."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "items": [
                        {
                            "id": "888959076",
                            "type": "ARTICLE",
                            "title": "Дублирование пакетов",
                            "year": 2023,
                            "description": {
                                "short": {"ru": "Информационные процессы. 2023."},
                            },
                        }
                    ],
                },
            }
        )

    def get(self, url, **kwargs):
        """Record the call and return a canned payload holding one thesis record."""
        self.gets.append((url, kwargs))
        return FakeResponse(
            {
                "lang": "ru",
                "success": True,
                "data": [
                    {
                        "id": 1045750164,
                        "year": 2025,
                        "level": "Бакалавриат",
                        "title": "Аппаратно-программный комплекс защиты сети",
                        "rating": 8,
                        "student": "Лесняк Владислав Евгеньевич",
                        "learnProgram": {
                            "title": "Информатика и вычислительная техника",
                            "url": "https://hse.ru/ba/isct/",
                        },
                        "orgUnit": {
                            "title": "МИЭМ",
                            "url": "https://www.hse.ru/org/url/59315150",
                        },
                        "supervisors": [
                            {
                                "url": "https://www.hse.ru/org/persons/803294906",
                                "name": "Борисов Сергей Петрович",
                            }
                        ],
                    }
                ],
            }
        )
def test_normalize_profile_url_supports_staff_and_org_persons():
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
@@ -26,3 +86,34 @@ def test_extract_person_tabs_prefers_person_menu_addition():
assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Check that widget script tags yield publications and theses sections.

    The page contains two ``<script>`` tags carrying ``data-author`` and
    ``data-person-id`` attributes; the fake session supplies the responses.
    """
    # The HTML literal is kept flush-left so its content stays exactly as given.
    soup = BeautifulSoup(
        """
<script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>
<script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>
""",
        "html.parser",
    )
    session = FakeSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    # next() without a default raises StopIteration if a section type is missing,
    # which fails the test.
    publications = next(section for section in sections if section["type"] == "publications")
    theses = next(section for section in sections if section["type"] == "graduation_theses")

    assert publications["publications_count"] == 1
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/888959076"
    assert theses["theses_count"] == 1
    assert theses["theses"][0]["student"] == "Лесняк Владислав Евгеньевич"
    assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"

    # The fake session recorded the outgoing requests: check where they went.
    assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
    assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}