feat: add employee news links parsing and storage

This commit is contained in:
Anton
2026-05-22 18:50:25 +03:00
parent 680ac6e980
commit 4d2a071ec0
19 changed files with 636 additions and 16 deletions

View File

@@ -1,6 +1,6 @@
from datetime import datetime, timezone
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
@@ -35,6 +35,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
"sections": [
{"type": "publications", "publications": [{"title": "Paper"}]},
{"type": "courses_by_year", "courses": [{"title": "Course"}]},
{"type": "news", "news_links": [{"title": "News", "url": "https://example.test/news"}]},
],
},
)
@@ -46,6 +47,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
assert payload["email_text"] == "person@hse.ru"
assert payload["publications_count"] == 1
assert payload["courses_count"] == 1
assert payload["news_count"] == 1
assert payload["first_seen_display"] != "Не указано"
@@ -104,6 +106,19 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
"type": "generic",
"raw_text": "Fallback text",
},
{
"title": "В новостях",
"type": "news",
"news_links": [
{
"title": "News title",
"url": "https://example.test/news",
"summary": "News summary",
"published_at": "2026-04-28T00:00:00+00:00",
"published_year": 2026,
}
],
},
],
},
)
@@ -118,6 +133,41 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
assert payload["sections"][2]["courses"][0]["title"] == "Course"
assert payload["sections"][3]["theses"][0]["student"] == "Student Name"
assert payload["sections"][4]["paragraphs"] == ["Fallback text"]
assert payload["sections"][5]["news_links"][0]["title"] == "News title"
assert payload["news_links"][0]["published_display"] == "28.04.2026"
def test_employee_payload_prefers_stored_news_links(db_session):
    """Check that payloads report the stored EmployeeNewsLink row.

    The employee's ``current_data`` carries one news entry titled
    "Old news", while a separate ``EmployeeNewsLink`` row titled
    "Stored news" is persisted for the same employee. The detail payload
    is expected to expose the stored row's title and a ``DD.MM.YYYY``
    rendering of its ``published_at`` value.
    """
    embedded_news_section = {
        "type": "news",
        "news_links": [{"title": "Old news"}],
    }
    person = Employee(
        profile_key="staff:news",
        canonical_url="https://www.hse.ru/staff/news",
        full_name="News Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"sections": [embedded_news_section]},
    )
    db_session.add(person)
    # Commit before building the link: the link below reads person.id.
    db_session.commit()

    stored_link = EmployeeNewsLink(
        employee_id=person.id,
        title="Stored news",
        url="https://example.test/stored",
        summary="Stored summary",
        published_at=datetime(2026, 4, 28, tzinfo=timezone.utc),
        published_year=2026,
        source_hash="b" * 64,
    )
    db_session.add(stored_link)
    db_session.commit()

    display_payload = employee_display_payload(person)
    detail_payload = employee_detail_payload(person)

    # NOTE(review): both the embedded data and the stored rows contain
    # exactly one entry, so this count alone does not tell the two
    # sources apart; the title assertion below does.
    assert display_payload["news_count"] == 1

    first_link = detail_payload["news_links"][0]
    assert first_link["title"] == "Stored news"
    assert first_link["published_display"] == "28.04.2026"
def test_employee_payloads_tolerate_malformed_current_data(db_session):