feat: add employee news links parsing and storage

This commit is contained in:
Anton
2026-05-22 18:50:25 +03:00
parent 680ac6e980
commit 4d2a071ec0
19 changed files with 636 additions and 16 deletions

View File

@@ -1,6 +1,6 @@
from datetime import datetime, timezone
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink
from app.services.admin_data import (
employee_detail_payload,
employee_display_payload,
@@ -35,6 +35,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
"sections": [
{"type": "publications", "publications": [{"title": "Paper"}]},
{"type": "courses_by_year", "courses": [{"title": "Course"}]},
{"type": "news", "news_links": [{"title": "News", "url": "https://example.test/news"}]},
],
},
)
@@ -46,6 +47,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
assert payload["email_text"] == "person@hse.ru"
assert payload["publications_count"] == 1
assert payload["courses_count"] == 1
assert payload["news_count"] == 1
assert payload["first_seen_display"] != "Не указано"
@@ -104,6 +106,19 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
"type": "generic",
"raw_text": "Fallback text",
},
{
"title": "В новостях",
"type": "news",
"news_links": [
{
"title": "News title",
"url": "https://example.test/news",
"summary": "News summary",
"published_at": "2026-04-28T00:00:00+00:00",
"published_year": 2026,
}
],
},
],
},
)
@@ -118,6 +133,41 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
assert payload["sections"][2]["courses"][0]["title"] == "Course"
assert payload["sections"][3]["theses"][0]["student"] == "Student Name"
assert payload["sections"][4]["paragraphs"] == ["Fallback text"]
assert payload["sections"][5]["news_links"][0]["title"] == "News title"
assert payload["news_links"][0]["published_display"] == "28.04.2026"
def test_employee_payload_prefers_stored_news_links(db_session):
    """Check payloads for an employee with both JSON and stored news links.

    The employee's ``current_data`` carries an "Old news" entry, while a
    separate ``EmployeeNewsLink`` row titled "Stored news" is persisted for
    the same employee. The detail payload must list the stored row first.
    """
    legacy_sections = [{"type": "news", "news_links": [{"title": "Old news"}]}]
    employee = Employee(
        profile_key="staff:news",
        canonical_url="https://www.hse.ru/staff/news",
        full_name="News Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data={"sections": legacy_sections},
    )
    db_session.add(employee)
    # Commit first so that employee.id is populated for the link below.
    db_session.commit()

    stored_link = EmployeeNewsLink(
        employee_id=employee.id,
        title="Stored news",
        url="https://example.test/stored",
        summary="Stored summary",
        published_at=datetime(2026, 4, 28, tzinfo=timezone.utc),
        published_year=2026,
        source_hash="b" * 64,
    )
    db_session.add(stored_link)
    db_session.commit()

    display = employee_display_payload(employee)
    detail = employee_detail_payload(employee)
    first_link = detail["news_links"][0]

    assert display["news_count"] == 1
    assert first_link["title"] == "Stored news"
    assert first_link["published_display"] == "28.04.2026"
def test_employee_payloads_tolerate_malformed_current_data(db_session):

View File

@@ -22,6 +22,8 @@ def test_directory_template_is_russian_and_uses_display_dates():
assert "На странице: {{ value }}" in template
assert "{% for value in [25, 50, 100] %}" in template
assert "Найдено:" in template
assert "Новости" in template
assert "employee.news_count" in template
assert "employee.first_seen_display" in template
assert "employee.last_seen_display" in template
assert "employee.dismissed_display" in template

View File

@@ -20,7 +20,7 @@ def test_health_returns_versions():
response = client.get("/api/health")
assert response.status_code == 200
assert response.json()["backend_version"] == "0.6.2"
assert response.json()["backend_version"] == "0.7.0"
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
@@ -154,7 +154,7 @@ def test_mcp_service_info_returns_tools_and_dataset_hash():
assert response.status_code == 200
payload = json.loads(response.json()["result"]["content"][0]["text"])
assert payload["service_name"] == "miem-employees"
assert payload["backend_version"] == "0.6.2"
assert payload["backend_version"] == "0.7.0"
assert payload["dataset"]["hash"]
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])

View File

@@ -1,7 +1,16 @@
import gzip
from datetime import datetime, timezone
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache
from app.models import (
CrawlError,
CrawlRun,
CrawlRunEmployeeChange,
Employee,
EmployeeNewsLink,
EmployeePublication,
EmployeeSnapshot,
ParseResourceCache,
)
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
from app.services.resource_cache import ResourceCache
@@ -253,6 +262,44 @@ def test_upsert_employee_records_publication_errors_without_failing_employee(mon
assert "публикации" in error.message.lower()
def test_upsert_employee_saves_news_links_and_reuses_existing_rows(db_session):
    """Upsert the same parsed employee in two crawl runs; expect one news row.

    The second upsert uses identical parsed data, so the query for the
    employee's ``EmployeeNewsLink`` rows must still return exactly one row.
    """
    source_url = "https://miem.hse.ru/persons"
    first_run = CrawlRun(source_url=source_url, status="running")
    second_run = CrawlRun(source_url=source_url, status="running")
    db_session.add_all([first_run, second_run])
    db_session.commit()

    first_parsed = _parsed_employee_with_news("news-person")
    employee, _ = _upsert_employee(db_session, first_run, first_parsed)
    db_session.commit()

    second_parsed = _parsed_employee_with_news("news-person")
    _upsert_employee(db_session, second_run, second_parsed)
    db_session.commit()

    link_query = db_session.query(EmployeeNewsLink).filter_by(employee_id=employee.id)
    news_links = link_query.all()

    assert len(news_links) == 1
    stored_link = news_links[0]
    assert stored_link.title == "News Title"
    assert stored_link.url == "https://www.hse.ru/news/1.html"
    assert stored_link.published_year == 2026
def test_upsert_employee_records_news_errors_without_failing_employee(monkeypatch, db_session):
    """Make news syncing raise and check the employee is still upserted.

    ``_sync_employee_news_links`` is replaced with a stub that raises
    ``RuntimeError``. The upsert must still return the employee as changed,
    and exactly one ``CrawlError`` mentioning "новости" must be recorded.
    """
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(run)
    db_session.commit()

    def failing_sync(*_args, **_kwargs):
        # Stand-in for the real news sync: fails whatever it is called with.
        raise RuntimeError("boom")

    monkeypatch.setattr("app.services.crawler._sync_employee_news_links", failing_sync)

    parsed = _parsed_employee_with_news("news-error-safe")
    employee, changed = _upsert_employee(db_session, run, parsed)
    db_session.commit()

    assert changed is True
    assert employee.full_name == "Same Person"
    assert db_session.query(Employee).filter_by(profile_key="staff:news-error-safe").one()

    # .one() also verifies that a single error row was written.
    recorded_error = db_session.query(CrawlError).one()
    assert "новости" in recorded_error.message.lower()
def test_checksum_changes_when_widget_data_changes():
base = _parsed_employee("widgets")
changed = _parsed_employee("widgets")
@@ -314,3 +361,23 @@ def _parsed_employee_with_publication(profile_id: str) -> dict:
}
]
return parsed
def _parsed_employee_with_news(profile_id: str) -> dict:
    """Return the base parsed employee with its sections replaced by one news section.

    Args:
        profile_id: Identifier forwarded to ``_parsed_employee``.

    Returns:
        The dict produced by ``_parsed_employee`` whose ``"sections"`` key
        holds a single ``"news"`` section containing one news link.
    """
    title = "News Title"
    url = "https://www.hse.ru/news/1.html"
    news_link = {
        "title": title,
        "url": url,
        "summary": "News summary",
        "published_at": "2026-04-28T00:00:00+00:00",
        "published_year": 2026,
        # raw_data repeats the title and url of the link itself.
        "raw_data": {"title": title, "url": url},
    }
    news_section = {"type": "news", "news_links": [news_link]}

    parsed = _parsed_employee(profile_id)
    parsed["sections"] = [news_section]
    return parsed

View File

@@ -69,3 +69,47 @@ def test_runtime_schema_creates_employee_publications_table_when_employees_exist
assert "employee_publications" in inspector.get_table_names()
columns = {column["name"] for column in inspector.get_columns("employee_publications")}
assert {"employee_id", "publication_id", "doi_url", "authors", "raw_data", "source_hash"}.issubset(columns)
def test_runtime_schema_creates_employee_news_links_table_when_employees_exist(monkeypatch):
    """Check that the runtime schema step adds ``employee_news_links``.

    An in-memory SQLite database is prepared with only the ``employees`` and
    ``crawl_runs`` tables. ``_ensure_runtime_schema`` is then run twice
    against it, so a second call must not fail, and the resulting table must
    expose the expected news-link columns.
    """
    employees_ddl = (
        "\n"
        "CREATE TABLE employees (\n"
        "id INTEGER PRIMARY KEY,\n"
        "profile_key VARCHAR(255) NOT NULL UNIQUE,\n"
        "canonical_url TEXT NOT NULL,\n"
        "status VARCHAR(32) NOT NULL DEFAULT 'active',\n"
        "first_seen_at DATETIME NOT NULL,\n"
        "last_seen_at DATETIME NOT NULL,\n"
        "created_at DATETIME NOT NULL,\n"
        "updated_at DATETIME NOT NULL\n"
        ")\n"
    )
    crawl_runs_ddl = (
        "\n"
        "CREATE TABLE crawl_runs (\n"
        "id INTEGER PRIMARY KEY,\n"
        "source_url TEXT NOT NULL,\n"
        "status VARCHAR(32) NOT NULL DEFAULT 'running',\n"
        "found_count INTEGER NOT NULL DEFAULT 0,\n"
        "parsed_count INTEGER NOT NULL DEFAULT 0,\n"
        "skipped_count INTEGER NOT NULL DEFAULT 0\n"
        ")\n"
    )

    engine = create_engine("sqlite:///:memory:")
    with engine.begin() as connection:
        for ddl in (employees_ddl, crawl_runs_ddl):
            connection.execute(text(ddl))

    # Point the application at the prepared engine before touching the schema.
    monkeypatch.setattr("app.db.engine", engine)
    _ensure_runtime_schema()
    _ensure_runtime_schema()

    inspector = inspect(engine)
    assert "employee_news_links" in inspector.get_table_names()

    expected_columns = {
        "employee_id",
        "title",
        "url",
        "summary",
        "published_at",
        "published_year",
        "source_hash",
        "raw_data",
    }
    columns = {column["name"] for column in inspector.get_columns("employee_news_links")}
    assert expected_columns.issubset(columns)

View File

@@ -13,6 +13,9 @@ def test_employee_detail_template_is_human_readable():
assert "section.list_items" in template
assert "Основная информация" in template
assert "Контакты" in template
assert "В новостях" in template
assert "employee_view.news_links" in template
assert "news.summary" in template
assert "Разделы профиля" in template
assert "graduation_theses" in template
assert "Год защиты" in template

View File

@@ -232,3 +232,45 @@ def test_news_heading_with_publications_word_does_not_absorb_widget_publications
assert len(publications) == 1
assert publications[0]["title"] == "Публикации и исследования"
assert publications[0]["publications_count"] == 1
def test_extract_sections_parses_employee_news_links():
    """Parse a ``press_links_news`` tab holding two posts into one news section.

    The first post has a date block, a relative link and a summary; the
    second has only an absolute link. The test expects a single section of
    type ``"news"`` with both links counted, and checks the fields extracted
    from the first post.
    """
    markup = (
        "\n"
        '<div class="b-person-data posts hidden printable" data-tab="press_links_news" tab-node="press_links_news">\n'
        '<div class="post f8">\n'
        '<div class="post__extra">\n'
        '<div class="post-meta">\n'
        '<div class="post-meta__date">\n'
        '<div class="post-meta__day">28</div>\n'
        '<div class="post-meta__month">апр.</div>\n'
        '<div class="post-meta__year">2026</div>\n'
        "</div>\n"
        "</div>\n"
        "</div>\n"
        '<div class="post__content">\n'
        '<h2 class="first_child"><a class="link" href="/news/edu/1153850518.html">Как финал ВсОШ формирует кадры</a></h2>\n'
        '<div class="post__text"><p class="with-indent">Краткое описание новости.</p></div>\n'
        "</div>\n"
        "</div>\n"
        '<div class="post f8">\n'
        '<div class="post__content">\n'
        '<h2><a href="https://miem.hse.ru/news/1123589375.html">Партнер магистратуры</a></h2>\n'
        "</div>\n"
        "</div>\n"
        "</div>\n"
    )
    soup = BeautifulSoup(markup, "html.parser")

    sections = extract_sections(soup, "https://www.hse.ru/staff/avsergeev")

    assert len(sections) == 1
    news = sections[0]
    assert news["type"] == "news"
    assert news["news_count"] == 2

    first_link = news["news_links"][0]
    assert first_link["title"] == "Как финал ВсОШ формирует кадры"
    # The relative href is expected to be resolved to an absolute URL.
    assert first_link["url"] == "https://www.hse.ru/news/edu/1153850518.html"
    assert first_link["summary"] == "Краткое описание новости."
    assert first_link["published_at"] == "2026-04-28T00:00:00+00:00"
    assert first_link["published_year"] == 2026