302 lines
11 KiB
Python
302 lines
11 KiB
Python
from datetime import datetime, timezone
|
||
|
||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink
|
||
from app.services.admin_data import (
|
||
employee_detail_payload,
|
||
employee_display_payload,
|
||
format_admin_datetime,
|
||
list_employees_page,
|
||
run_detail_payload,
|
||
run_payload,
|
||
stats_payload,
|
||
)
|
||
|
||
|
||
def test_format_admin_datetime_handles_datetime_string_and_none():
    """Check ``format_admin_datetime`` on a datetime, an ISO string and ``None``."""
    expected_display = "28.04.2026 20:13"
    aware_moment = datetime(2026, 4, 28, 17, 13, 34, tzinfo=timezone.utc)
    iso_text = "2026-04-28T17:13:34.448605+00:00"

    # Both inputs describe 17:13 UTC and are expected to render as 20:13.
    assert format_admin_datetime(aware_moment) == expected_display
    assert format_admin_datetime(iso_text) == expected_display
    # A missing value falls back to the placeholder text.
    assert format_admin_datetime(None) == "Не указано"
|
||
|
||
|
||
def test_employee_display_payload_extracts_common_fields(db_session):
    """Check the summary fields ``employee_display_payload`` derives from ``current_data``."""
    contact_block = {
        "emails": ["person@hse.ru"],
        "phones": ["+79990000000"],
        "address": "Moscow",
    }
    profile_sections = [
        {"type": "publications", "publications": [{"title": "Paper"}]},
        {"type": "courses_by_year", "courses": [{"title": "Course"}]},
        {"type": "news", "news_links": [{"title": "News", "url": "https://example.test/news"}]},
    ]
    person = Employee(
        profile_key="staff:person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={
            "positions": ["Professor"],
            "hse_start_year": 2024,
            "contacts": contact_block,
            "sections": profile_sections,
        },
    )

    summary = employee_display_payload(person)

    # One entry per section type, so every counter is expected to be 1.
    expected_fields = {
        "positions_text": "Professor",
        "status_display": "Работает",
        "email_text": "person@hse.ru",
        "publications_count": 1,
        "courses_count": 1,
        "news_count": 1,
    }
    for field_name, expected_value in expected_fields.items():
        assert summary[field_name] == expected_value
    # first_seen_at is set, so the placeholder text must not be used.
    assert summary["first_seen_display"] != "Не указано"
|
||
|
||
|
||
def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
    """Check ``employee_detail_payload`` output for each section type used here.

    The employee carries one section of every kind exercised by this test
    (year blocks, publications, courses, graduation theses, a generic
    fallback and news); the assertions read them back by position.
    """
    education_section = {
        "title": "Education",
        "type": "year_blocks",
        "year_entries": [{"year": 2020, "text": "Master degree"}],
    }
    publications_section = {
        "title": "Publications",
        "type": "publications",
        "publications": [{"title": "Paper", "text": "Paper details", "url": "https://example.test/paper"}],
    }
    courses_section = {
        "title": "Courses",
        "type": "courses_by_year",
        "academic_year": "2025/2026",
        "courses": [{"title": "Course", "url": "https://example.test/course"}],
    }
    theses_section = {
        "title": "ВКР",
        "type": "graduation_theses",
        "theses_count": 1,
        "theses": [
            {
                "student": "Student Name",
                "title": "Thesis title",
                "defense_year": 2025,
                "project_url": "https://www.hse.ru/edu/vkr/1",
            }
        ],
    }
    fallback_section = {
        "title": "Fallback",
        "type": "generic",
        "raw_text": "Fallback text",
    }
    news_section = {
        "title": "В новостях",
        "type": "news",
        "news_links": [
            {
                "title": "News title",
                "url": "https://example.test/news",
                "summary": "News summary",
                "published_at": "2026-04-28T00:00:00+00:00",
                "published_year": 2026,
            }
        ],
    }
    contact_block = {
        "emails": ["person@hse.ru"],
        "phones": ["+79990000000"],
        "address": "Moscow",
        "items": [{"raw": "consultation hours"}],
    }
    person = Employee(
        profile_key="staff:person",
        profile_type="staff",
        profile_id="person",
        canonical_url="https://www.hse.ru/staff/person",
        full_name="Person Name",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={
            "positions": ["Professor"],
            "hse_start_year": 2024,
            "contacts": contact_block,
            "external_ids": [{"system": "ORCID", "value": "0000", "url": "https://orcid.org/0000"}],
            "sections": [
                education_section,
                publications_section,
                courses_section,
                theses_section,
                fallback_section,
                news_section,
            ],
        },
    )

    detail = employee_detail_payload(person)

    # Contacts: raw "items" entries are expected as plain strings under "contact_items".
    assert detail["contacts"]["emails"] == ["person@hse.ru"]
    assert detail["contacts"]["contact_items"] == ["consultation hours"]
    assert detail["external_ids"][0]["system"] == "ORCID"

    # Sections are checked at the same positions they were supplied in.
    rendered_sections = detail["sections"]
    assert rendered_sections[0]["year_entries"][0]["text"] == "Master degree"
    assert rendered_sections[1]["publications"][0]["title"] == "Paper"
    assert rendered_sections[2]["courses"][0]["title"] == "Course"
    assert rendered_sections[3]["theses"][0]["student"] == "Student Name"
    # The generic section's raw_text is expected as a one-item paragraph list.
    assert rendered_sections[4]["paragraphs"] == ["Fallback text"]
    assert rendered_sections[5]["news_links"][0]["title"] == "News title"

    # The top-level news list carries a date-only display string.
    assert detail["news_links"][0]["published_display"] == "28.04.2026"
|
||
|
||
|
||
def test_employee_payload_prefers_stored_news_links(db_session):
    """Check that a persisted ``EmployeeNewsLink`` wins over news in ``current_data``."""
    embedded_sections = [{"type": "news", "news_links": [{"title": "Old news"}]}]
    person = Employee(
        profile_key="staff:news",
        canonical_url="https://www.hse.ru/staff/news",
        full_name="News Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"sections": embedded_sections},
    )
    db_session.add(person)
    # Commit first so person.id is available for the news link row below.
    db_session.commit()

    stored_link = EmployeeNewsLink(
        employee_id=person.id,
        title="Stored news",
        url="https://example.test/stored",
        summary="Stored summary",
        published_at=datetime(2026, 4, 28, tzinfo=timezone.utc),
        published_year=2026,
        source_hash="b" * 64,
    )
    db_session.add(stored_link)
    db_session.commit()

    summary = employee_display_payload(person)
    details = employee_detail_payload(person)

    assert summary["news_count"] == 1
    # The stored row, not the embedded "Old news" entry, must be reported.
    assert details["news_links"][0]["title"] == "Stored news"
    assert details["news_links"][0]["published_display"] == "28.04.2026"
|
||
|
||
|
||
def test_employee_payloads_tolerate_malformed_current_data(db_session):
    """Check that both payload builders return empty values for non-dict ``current_data``."""
    broken_person = Employee(
        profile_key="staff:broken",
        canonical_url="https://www.hse.ru/staff/broken",
        full_name="Broken Data",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        # Deliberately a string rather than a mapping.
        current_data="not-a-dict",
    )

    summary = employee_display_payload(broken_person)
    details = employee_detail_payload(broken_person)

    # Summary view: no positions and an empty e-mail text.
    assert summary["positions"] == []
    assert summary["email_text"] == ""

    # Detail view: empty contacts and no sections.
    assert details["contacts"]["emails"] == []
    assert details["contacts"]["contact_items"] == []
    assert details["sections"] == []
|
||
|
||
|
||
def test_list_employees_page_filters_sorts_and_paginates(db_session):
    """Check ``list_employees_page`` with a status filter, a sort order and a limit."""
    dismissed_person = Employee(
        profile_key="staff:b",
        canonical_url="https://www.hse.ru/staff/b",
        full_name="Beta",
        status="dismissed",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": []}},
    )
    db_session.add(dismissed_person)

    active_person = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"contacts": {"emails": ["alpha@hse.ru"]}},
    )
    db_session.add(active_person)
    db_session.commit()

    query_options = {
        "status": "active",
        "sort": "full_name",
        "direction": "asc",
        "limit": 10,
    }
    listing = list_employees_page(db_session, **query_options)

    # Only the active employee passes the status filter.
    # NOTE(review): with a single matching row the sort order is not really
    # exercised here - confirm whether a dedicated ordering check is wanted.
    assert listing["total"] == 1
    assert listing["employees"][0]["full_name"] == "Alpha"
    # NOTE(review): limit=10 is requested but 50 is expected back; presumably
    # the service normalizes unsupported page sizes - verify against
    # list_employees_page.
    assert listing["limit"] == 50
|
||
|
||
|
||
def test_stats_payload_uses_latest_run_new_count(db_session):
    """Check that ``stats_payload`` reports employee totals and the run's ``new_count``."""
    only_employee = Employee(
        profile_key="staff:a",
        canonical_url="https://www.hse.ru/staff/a",
        full_name="Alpha",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add(only_employee)

    finished_run = CrawlRun(
        source_url="https://miem.hse.ru/persons",
        status="completed",
        new_count=3,
    )
    db_session.add(finished_run)
    db_session.commit()

    stats = stats_payload(db_session)

    # One active employee stored, and the only run recorded three new profiles.
    expected_stats = {"total": 1, "active": 1, "new_in_last_run": 3}
    for stat_name, expected_value in expected_stats.items():
        assert stats[stat_name] == expected_value
|
||
|
||
|
||
def test_run_payload_calculates_progress():
    """Check the derived progress fields ``run_payload`` reports for a running crawl."""
    counters = {
        "found_count": 10,
        "parsed_count": 4,
        "skipped_count": 2,
        "error_count": 1,
    }
    active_run = CrawlRun(
        source_url="https://miem.hse.ru/persons",
        status="running",
        **counters,
    )

    summary = run_payload(active_run)

    # 7 matches parsed + skipped + errors (4 + 2 + 1), and 7 of 10 found is 70%.
    # NOTE(review): the exact formula lives in run_payload - confirm there.
    assert summary["processed_count"] == 7
    assert summary["progress_percent"] == 70.0
    assert summary["status_display"] == "Выполняется"
|
||
|
||
|
||
def test_run_detail_payload_groups_changes_and_handles_old_runs(db_session):
    """Check ``run_detail_payload`` for a run with recorded changes and one without.

    ``tracked_run`` gets a change row and an error row; ``legacy_run`` gets
    neither, and is expected to report that change details are unavailable.
    """
    legacy_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed")
    tracked_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="completed", new_count=1)
    newcomer = Employee(
        profile_key="staff:new",
        canonical_url="https://www.hse.ru/staff/new",
        full_name="New Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add_all([legacy_run, tracked_run, newcomer])
    # Commit so the ids referenced by the rows below are available.
    db_session.commit()

    change_row = CrawlRunEmployeeChange(
        crawl_run_id=tracked_run.id,
        employee_id=newcomer.id,
        profile_key=newcomer.profile_key,
        profile_url=newcomer.canonical_url,
        full_name=newcomer.full_name,
        change_type="new",
        profile_available=True,
        message="added",
    )
    db_session.add(change_row)

    error_row = CrawlError(
        crawl_run_id=tracked_run.id,
        profile_url=newcomer.canonical_url,
        error_type="ValueError",
        message="bad",
    )
    db_session.add(error_row)
    db_session.commit()

    tracked_detail = run_detail_payload(db_session, tracked_run)
    legacy_detail = run_detail_payload(db_session, legacy_run)

    # The tracked run exposes its change under the "new" group and its error.
    assert tracked_detail["changes_detail_available"] is True
    assert tracked_detail["changes"]["new"][0]["full_name"] == "New Person"
    assert tracked_detail["errors"][0]["error_type"] == "ValueError"

    # The run without change rows still returns an (empty) "new" group.
    assert legacy_detail["changes_detail_available"] is False
    assert legacy_detail["changes"]["new"] == []
|