"""Tests for profile URL normalization, person-tab extraction, and HSE widget section enrichment."""
from bs4 import BeautifulSoup
|
||
|
||
from app.parser.profile import enrich_sections_from_hse_widgets, extract_person_tabs, extract_sections
|
||
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
|
||
|
||
|
||
class FakeResponse:
    """Minimal stand-in for an HTTP response that carries a canned JSON payload."""

    def __init__(self, payload):
        """Store ``payload``; it is handed back unchanged by :meth:`json`."""
        self.payload = payload

    def raise_for_status(self):
        """Never raise; always return ``None``."""
        return None

    def json(self):
        """Return the payload supplied at construction time."""
        return self.payload
|
||
|
||
|
||
class FakeSession:
    """Fake HTTP session that records calls and answers with fixed payloads.

    ``post`` returns a flat publications search result containing one article;
    ``get`` returns a list with one graduation-thesis record. Every call is
    appended to ``posts`` / ``gets`` as a ``(url, kwargs)`` tuple so tests can
    inspect what was requested.
    """

    def __init__(self):
        # Recorded calls, in order, as (url, kwargs) tuples.
        self.posts = []
        self.gets = []

    def post(self, url, **kwargs):
        """Record the call and return a canned single-publication response."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "items": [
                        {
                            "id": "888959076",
                            "type": "ARTICLE",
                            "title": "Дублирование пакетов",
                            "year": 2023,
                            "language": {"name": "ru"},
                            "status": 1,
                            "authorsByType": {
                                "author": [
                                    {
                                        "id": "568398853",
                                        "href": "/org/persons/568398853",
                                        "title": {"ru": "Левицкий И. А.", "en": ""},
                                        "reverseTitle": {"ru": "И. А. Левицкий", "en": ""},
                                    }
                                ]
                            },
                            "description": {"short": {"ru": "Информационные процессы. 2023."}},
                            "annotation": {"ru": "<p>Русская аннотация</p>"},
                            "documents": {"DOI": {"href": "https://doi.org/10.1/test"}},
                        }
                    ],
                },
            }
        )

    def get(self, url, **kwargs):
        """Record the call and return a canned single-thesis response."""
        self.gets.append((url, kwargs))
        return FakeResponse(
            {
                "lang": "ru",
                "success": True,
                "data": [
                    {
                        "id": 1045750164,
                        "year": 2025,
                        "level": "Бакалавриат",
                        "title": "Аппаратно-программный комплекс защиты сети",
                        "rating": 8,
                        "student": "Лесняк Владислав Евгеньевич",
                        "learnProgram": {"title": "Информатика и вычислительная техника", "url": "https://hse.ru/ba/isct/"},
                        "orgUnit": {"title": "МИЭМ", "url": "https://www.hse.ru/org/url/59315150"},
                        "supervisors": [{"url": "https://www.hse.ru/org/persons/803294906", "name": "Борисов Сергей Петрович"}],
                    }
                ],
            }
        )
|
||
|
||
|
||
class GroupedPublicationsSession(FakeSession):
    """Fake session whose ``post`` returns publications grouped by year.

    Unlike the base class, ``result["items"]`` is a nested mapping
    (``items -> "year" -> items -> "<year>" -> [publication, ...]``) rather
    than a flat list. ``total`` is left at 1 even though two publications
    are present in the grouped payload.
    """

    def post(self, url, **kwargs):
        """Record the call and return a canned grouped-by-year response."""
        self.posts.append((url, kwargs))
        return FakeResponse(
            {
                "status": "ok",
                "result": {
                    "more": False,
                    "total": 1,
                    "groupType": 2,
                    "items": {
                        "year": {
                            "header": {"ru": "по году", "en": "by year"},
                            "criteria": {"year": []},
                            "items": {
                                "2011": [
                                    {
                                        "id": "146366790",
                                        "type": "ARTICLE",
                                        "title": "Развитие теории самосогласованного поля",
                                        "year": 2011,
                                        "description": {"short": {"ru": "Журнал физической химии 2011."}},
                                    }
                                ],
                                "2012": [
                                    {
                                        "id": "146367323",
                                        "type": "ARTICLE",
                                        "title": "Self-consistent field theory investigation",
                                        "year": 2012,
                                        "description": {"short": {"en": "Russian Journal of Physical Chemistry A 2012."}},
                                    }
                                ],
                            },
                        }
                    },
                },
            }
        )
|
||
|
||
|
||
def test_normalize_profile_url_supports_staff_and_org_persons():
    """Check URL normalization and identity parsing for staff and org/persons profiles."""
    # A relative staff path is made absolute and its fragment is dropped.
    assert normalize_profile_url("/staff/avsergeev#sci") == "https://www.hse.ru/staff/avsergeev"
    # A trailing slash on an org/persons URL is removed.
    assert normalize_profile_url("https://www.hse.ru/org/persons/123/") == "https://www.hse.ru/org/persons/123"
    # Identity is reported as a (kind, identifier) pair.
    assert parse_profile_identity("https://www.hse.ru/staff/avsergeev") == ("staff", "avsergeev")
|
||
|
||
|
||
def test_extract_person_tabs_prefers_person_menu_addition():
    """Check that tabs come from the person-menu block, not from unrelated links."""
    # The fixture has two links inside the person menu and one link to
    # another person outside it; only the menu links should become tabs.
    soup = BeautifulSoup(
        """
        <div class="person-menu is-desktop small person-menu-addition">
            <a href="#main">Домашняя страница</a>
            <a href="#sci" data-index="1">Публикации</a>
        </div>
        <a href="/org/persons/999">Other person</a>
        """,
        "html.parser",
    )

    tabs = extract_person_tabs(soup, "https://www.hse.ru/staff/avsergeev")

    assert [tab["title"] for tab in tabs] == ["Домашняя страница", "Публикации"]
    # Fragment-only hrefs are resolved against the profile URL.
    assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"
|
||
|
||
|
||
def test_enrich_sections_from_hse_widgets_loads_publications_and_vkr():
    """Check that publication and thesis widgets produce enriched sections.

    The page contains a publications widget script and a VKR widget script;
    the fake session supplies one publication (POST) and one thesis (GET).
    """
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="568398853" data-widget-name="AuthorSearch"></script>
        <script src="/n/stat/vkr/app.js" data-api-url="/n/vkr/api/" data-person-id="803294906"></script>
        """,
        "html.parser",
    )
    session = FakeSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/803294906",
        {"User-Agent": "test"},
        10,
        [],
    )

    # Use a default so a missing section fails with a readable assertion
    # instead of an opaque StopIteration from next().
    publications = next((section for section in sections if section["type"] == "publications"), None)
    theses = next((section for section in sections if section["type"] == "graduation_theses"), None)
    assert publications is not None, "no 'publications' section was produced"
    assert theses is not None, "no 'graduation_theses' section was produced"

    first_publication = publications["publications"][0]
    assert publications["publications_count"] == 1
    assert first_publication["url"] == "https://publications.hse.ru/view/888959076"
    assert first_publication["doi_url"] == "https://doi.org/10.1/test"
    # The fixture's annotation is wrapped in <p> tags; the result is plain text.
    assert first_publication["annotation"] == {"ru": "Русская аннотация"}
    assert first_publication["authors"][0]["is_current_employee"] is True

    first_thesis = theses["theses"][0]
    assert theses["theses_count"] == 1
    assert first_thesis["student"] == "Лесняк Владислав Евгеньевич"
    assert first_thesis["project_url"] == "https://www.hse.ru/edu/vkr/1045750164"

    # The recorded calls show which endpoint and query parameters were used.
    assert session.posts[0][0] == "https://publications.hse.ru/api/searchPubs"
    assert session.gets[0][1]["params"] == {"supervisorId": "803294906"}
|
||
|
||
|
||
def test_enrich_sections_from_hse_widgets_loads_grouped_publications():
    """Check that a grouped-by-year publications response is flattened into one list."""
    soup = BeautifulSoup(
        """
        <script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
        """,
        "html.parser",
    )
    session = GroupedPublicationsSession()

    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/133709486",
        {"User-Agent": "test"},
        10,
        [],
    )

    # Use a default so a missing section fails with a readable assertion
    # instead of an opaque StopIteration from next().
    publications = next((section for section in sections if section["type"] == "publications"), None)
    assert publications is not None, "no 'publications' section was produced"

    # Both year groups (2011 and 2012) contribute one item each, in that order.
    assert publications["publications_count"] == 2
    assert [item["id"] for item in publications["publications"]] == ["146366790", "146367323"]
    assert publications["publications"][0]["url"] == "https://publications.hse.ru/view/146366790"
    assert publications["publications"][1]["url"] == "https://publications.hse.ru/view/146367323"
|
||
|
||
|
||
def test_news_heading_with_publications_word_does_not_absorb_widget_publications():
    """Check that a news heading mentioning publications stays a plain text section.

    The heading contains the word "публикаций", but the widget's publications
    must end up in their own, separate section rather than being merged into it.
    """
    soup = BeautifulSoup(
        """
        <h2>Статья профессора МИЭМ вошла в число самых популярных публикаций на портале SpringerLink</h2>
        <div class="post__text">
            <p>Первоначально статья профессора вышла в российском журнале.</p>
        </div>
        <script src="/n/stat/publications/dist-w/publs.js" data-author="133709486" data-widget-name="AuthorSearch"></script>
        """,
        "html.parser",
    )
    session = FakeSession()

    # First extract sections from the static HTML, then enrich them with widget data.
    sections = extract_sections(soup, "https://www.hse.ru/org/persons/133709486")
    sections = enrich_sections_from_hse_widgets(
        session,
        soup,
        "https://www.hse.ru/org/persons/133709486",
        {"User-Agent": "test"},
        10,
        sections,
    )

    # The news item keeps its paragraph type and original heading.
    assert sections[0]["type"] == "paragraphs"
    assert sections[0]["title"].startswith("Статья профессора")

    # Exactly one publications section exists, and it is the widget-derived one.
    publications = [section for section in sections if section["type"] == "publications"]
    assert len(publications) == 1
    assert publications[0]["title"] == "Публикации и исследования"
    assert publications[0]["publications_count"] == 1
|