fix: enrich HSE profile parsing with publications and theses
This commit is contained in:
@@ -1,9 +1,69 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from app.parser.profile import extract_person_tabs
|
||||
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs
|
||||
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
|
||||
|
||||
|
||||
class FakeResponse:
    """Stand-in for an HTTP response object that serves a canned JSON payload."""

    def __init__(self, payload):
        # Stored as-is; json() hands the very same object back.
        self.payload = payload

    def raise_for_status(self):
        """Do nothing and return None: this fake never signals an HTTP error."""
        return None

    def json(self):
        """Return the payload supplied at construction time."""
        return self.payload
|
||||
|
||||
|
||||
class FakeSession:
    """Test double for an HTTP session: logs every call and replies with fixed JSON."""

    def __init__(self):
        # Call logs; each entry is a (url, kwargs) tuple in call order.
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Log the POST call and reply with a result holding a single ARTICLE item."""
        self.posts.append((url, kwargs))

        article = {
            "id": "888959076",
            "type": "ARTICLE",
            "title": "Дублирование пакетов",
            "year": 2023,
            "description": {"short": {"ru": "Информационные процессы. 2023."}},
        }
        search_result = {
            "more": False,
            "total": 1,
            "items": [article],
        }
        return FakeResponse({"status": "ok", "result": search_result})

    def get(self, url, **kwargs):
        """Log the GET call and reply with a data list holding a single thesis record."""
        self.gets.append((url, kwargs))

        thesis = {
            "id": 1045750164,
            "year": 2025,
            "level": "Бакалавриат",
            "title": "Аппаратно-программный комплекс защиты сети",
            "rating": 8,
            "student": "Лесняк Владислав Евгеньевич",
            "learnProgram": {
                "title": "Информатика и вычислительная техника",
                "url": "https://hse.ru/ba/isct/",
            },
            "orgUnit": {
                "title": "МИЭМ",
                "url": "https://www.hse.ru/org/url/59315150",
            },
            "supervisors": [
                {
                    "url": "https://www.hse.ru/org/persons/803294906",
                    "name": "Борисов Сергей Петрович",
                }
            ],
        }
        return FakeResponse({"lang": "ru", "success": True, "data": [thesis]})
|
||||
|
||||
|
||||
def test_normalize_profile_url_supports_staff_and_org_persons():
|
||||
assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
|
||||
assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
|
||||
@@ -26,3 +86,34 @@ def test_extract_person_tabs_prefers_person_menu_addition():
|
||||
|
||||
assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
|
||||
assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
|
||||
|
||||
|
||||
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Check that widget script tags yield publications and graduation-thesis sections.

    The page contains one publications widget and one VKR widget; the fake
    session supplies one article and one thesis, and the test verifies both
    the resulting sections and the requests that were issued.
    """
    markup = """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>
        <script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>
        """
    page = BeautifulSoup(markup, "html.parser")
    fake_session = FakeSession()

    enriched = enrich_sections_from_hse_widgets(
        fake_session,
        page,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    def first_section(kind):
        # First section of the requested type; raises StopIteration if absent.
        return next(entry for entry in enriched if entry["type"] == kind)

    pubs_section = first_section("publications")
    theses_section = first_section("graduation_theses")

    # Publications come from the POST search response.
    assert pubs_section["publications_count"] == 1
    first_publication = pubs_section["publications"][0]
    assert first_publication["url"] == "https://publications.hse.ru/view/888959076"

    # Theses come from the GET response.
    assert theses_section["theses_count"] == 1
    first_thesis = theses_section["theses"][0]
    assert first_thesis["student"] == "Лесняк Владислав Евгеньевич"
    assert first_thesis["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"

    # The requests must target the expected endpoint and carry the person id.
    post_url, _post_kwargs = fake_session.posts[0]
    assert post_url == "https://publications.hse.ru/api/searchPubs"
    _get_url, get_kwargs = fake_session.gets[0]
    assert get_kwargs["params"] == {"supervisorId": "803294906"}
|
||||
|
||||
Reference in New Issue
Block a user