187 lines
7.3 KiB
Python
187 lines
7.3 KiB
Python
from bs4 import BeautifulSoup
|
|
|
|
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs
|
|
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
|
|
|
|
|
|
class FakeResponse:
    """Minimal response double that carries a canned JSON payload.

    It exposes only the two methods visible here, ``raise_for_status()``
    and ``json()``, so it can be returned from the fake sessions' ``post``
    and ``get`` methods.
    """

    def __init__(self, payload):
        """Remember *payload*; it is stored as-is, without copying."""
        self.payload = payload

    def raise_for_status(self):
        """Do nothing and return ``None``: the fake never signals an error."""
        return None

    def json(self):
        """Return the very payload object that was passed to the constructor."""
        return self.payload
|
|
|
|
|
|
class FakeSession:
    """Session double that records every call and answers with canned JSON.

    ``posts`` and ``gets`` collect ``(url, kwargs)`` tuples in call order so
    tests can assert on the URLs and keyword arguments that were used.
    """

    def __init__(self):
        """Start with empty call logs for POST and GET requests."""
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Record the call and return a flat publications payload.

        The payload's ``result.items`` is a plain list holding a single
        ``ARTICLE`` entry.
        """
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "items": [
                        {
                            "id": "888959076",
                            "type": "ARTICLE",
                            "title": "Дублирование пакетов",
                            "year": 2023,
                            "description": {"short": {"ru": "Информационные процессы. 2023."}},
                        }
                    ],
                },
            }
        )

    def get(self, url, **kwargs):
        """Record the call and return a payload with one record under ``data``.

        The record carries student, programme, org-unit and supervisor fields.
        """
        self.gets.append((url, kwargs))
        return FakeResponse(
            {
                "lang": "ru",
                "success": True,
                "data": [
                    {
                        "id": 1045750164,
                        "year": 2025,
                        "level": "Бакалавриат",
                        "title": "Аппаратно-программный комплекс защиты сети",
                        "rating": 8,
                        "student": "Лесняк Владислав Евгеньевич",
                        "learnProgram": {"title": "Информатика и вычислительная техника", "url": "https://hse.ru/ba/isct/"},
                        "orgUnit": {"title": "МИЭМ", "url": "https://www.hse.ru/org/url/59315150"},
                        "supervisors": [{"url": "https://www.hse.ru/org/persons/803294906", "name": "Борисов Сергей Петрович"}],
                    }
                ],
            }
        )
|
|
|
|
|
|
class GroupedPublicationsSession(FakeSession):
    """FakeSession variant whose POST answer groups publications by year.

    Unlike the parent's flat list, ``result.items`` here is a nested mapping:
    ``items["year"]["items"]`` maps a year string to a list of publications.
    """

    def post(self, url, **kwargs):
        """Record the call and return the grouped publications payload."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    # NOTE(review): `total` is 1 although two publications are
                    # listed below — presumably intentional; confirm.
                    "total": 1,
                    "groupType": 2,
                    "items": {
                        "year": {
                            "header": {"ru": "по году", "en": "by year"},
                            "criteria": {"year": []},
                            "items": {
                                "2011": [
                                    {
                                        "id": "146366790",
                                        "type": "ARTICLE",
                                        "title": "Развитие теории самосогласованного поля",
                                        "year": 2011,
                                        "description": {"short": {"ru": "Журнал физической химии 2011."}},
                                    }
                                ],
                                "2012": [
                                    {
                                        "id": "146367323",
                                        "type": "ARTICLE",
                                        "title": "Self-consistent field theory investigation",
                                        "year": 2012,
                                        "description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}},
                                    }
                                ],
                            },
                        }
                    },
                },
            }
        )
|
|
|
|
|
|
def test_normalize_profile_url_supports_staff_and_org_persons():
    """Check URL normalisation and identity parsing for both profile URL kinds."""
    # A relative staff path becomes absolute and loses its fragment.
    assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
    # An absolute org/persons URL loses its trailing slash.
    assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
    # Identity parsing yields a (kind, identifier) pair.
    assert parse_profile_identity("https://www.hse.ru/staff/avsergeev") == ("staff", "avsergeev")
|
|
|
|
|
|
def test_extract_person_tabs_prefers_person_menu_addition():
    """Tabs must come from the person-menu block, not from unrelated links."""
    soup = BeautifulSoup(
        """
        <div class="person-menu is-desktop small person-menu-addition">
          <a href="#main">Домашняя страница</a>
          <a href="#sci" data-index="1">Публикации</a>
        </div>
        <a href="/org/persons/999">Other person</a>
        """,
        "html.parser",
    )

    tabs = extract_person_tabs(soup, "https://www.hse.ru/staff/avsergeev")

    # Only the two menu anchors are expected; the link outside the menu is not.
    assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
    # Fragment-only hrefs are expected to be resolved against the profile URL.
    assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
|
|
|
|
|
|
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Both widget scripts on the page must produce their own section.

    The page contains a publications widget script and a VKR widget script;
    the fake session supplies one publication and one thesis record.
    """
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>
        <script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>
        """,
        "html.parser",
    )
    session = FakeSession()

    # The trailing positional arguments are presumably request headers, a
    # timeout and the already-collected sections — confirm against the
    # signature of enrich_sections_from_hse_widgets.
    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    # Look sections up with a default so a missing one fails with a clear
    # assertion message instead of a bare StopIteration.
    publications = next(
        (section for section in sections if section["type"] == "publications"),
        None,
    )
    theses = next(
        (section for section in sections if section["type"] == "graduation_theses"),
        None,
    )
    assert publications is not None, "no 'publications' section was produced"
    assert theses is not None, "no 'graduation_theses' section was produced"

    assert publications["publications_count"] == 1
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/888959076"
    assert theses["theses_count"] == 1
    assert theses["theses"][0]["student"] == "Лесняк Владислав Евгеньевич"
    assert theses["theses"][0]["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"
    # The recorded calls show which endpoint and query parameters were used.
    assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
    assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
|
|
|
|
|
|
def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
    """A year-grouped publications answer must be flattened into one list.

    The session returns two publications nested under the years 2011 and
    2012; both are expected in the resulting section, in that order.
    """
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
        """,
        "html.parser",
    )
    session = GroupedPublicationsSession()

    # The trailing positional arguments are presumably request headers, a
    # timeout and the already-collected sections — confirm against the
    # signature of enrich_sections_from_hse_widgets.
    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/133709486",
        {"User-Agent": "test"},
        10,
        [],
    )

    # Look the section up with a default so a missing one fails with a clear
    # assertion message instead of a bare StopIteration.
    publications = next(
        (section for section in sections if section["type"] == "publications"),
        None,
    )
    assert publications is not None, "no 'publications' section was produced"

    assert publications["publications_count"] == 2
    assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"]
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790"
    assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323"
|