feat: add employee news links parsing and storage
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee
|
||||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink
|
||||
from app.services.admin_data import (
|
||||
employee_detail_payload,
|
||||
employee_display_payload,
|
||||
@@ -35,6 +35,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
|
||||
"sections": [
|
||||
{"type": "publications", "publications": [{"title": "Paper"}]},
|
||||
{"type": "courses_by_year", "courses": [{"title": "Course"}]},
|
||||
{"type": "news", "news_links": [{"title": "News", "url": "https://example.test/news"}]},
|
||||
],
|
||||
},
|
||||
)
|
||||
@@ -46,6 +47,7 @@ def test_employee_display_payload_extracts_common_fields(db_session):
|
||||
assert payload["email_text"] == "person@hse.ru"
|
||||
assert payload["publications_count"] == 1
|
||||
assert payload["courses_count"] == 1
|
||||
assert payload["news_count"] == 1
|
||||
assert payload["first_seen_display"] != "Не указано"
|
||||
|
||||
|
||||
@@ -104,6 +106,19 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
|
||||
"type": "generic",
|
||||
"raw_text": "Fallback text",
|
||||
},
|
||||
{
|
||||
"title": "В новостях",
|
||||
"type": "news",
|
||||
"news_links": [
|
||||
{
|
||||
"title": "News title",
|
||||
"url": "https://example.test/news",
|
||||
"summary": "News summary",
|
||||
"published_at": "2026-04-28T00:00:00+00:00",
|
||||
"published_year": 2026,
|
||||
}
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
@@ -118,6 +133,41 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session):
|
||||
assert payload["sections"][2]["courses"][0]["title"] == "Course"
|
||||
assert payload["sections"][3]["theses"][0]["student"] == "Student Name"
|
||||
assert payload["sections"][4]["paragraphs"] == ["Fallback text"]
|
||||
assert payload["sections"][5]["news_links"][0]["title"] == "News title"
|
||||
assert payload["news_links"][0]["published_display"] == "28.04.2026"
|
||||
|
||||
|
||||
def test_employee_payload_prefers_stored_news_links(db_session):
    """Stored ``EmployeeNewsLink`` rows win over news embedded in ``current_data``."""
    # The profile JSON carries a different ("Old news") entry; the assertions
    # below expect the payloads to surface the stored row instead.
    embedded_profile = {"sections": [{"type": "news", "news_links": [{"title": "Old news"}]}]}
    employee = Employee(
        profile_key="staff:news",
        canonical_url="https://www.hse.ru/staff/news",
        full_name="News Person",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
        current_data=embedded_profile,
    )
    db_session.add(employee)
    db_session.commit()

    # The employee must be committed first so that ``employee.id`` is populated.
    stored_link = EmployeeNewsLink(
        employee_id=employee.id,
        title="Stored news",
        url="https://example.test/stored",
        summary="Stored summary",
        published_at=datetime(2026, 4, 28, tzinfo=timezone.utc),
        published_year=2026,
        source_hash="b" * 64,
    )
    db_session.add(stored_link)
    db_session.commit()

    display = employee_display_payload(employee)
    detail = employee_detail_payload(employee)

    assert display["news_count"] == 1
    first_link = detail["news_links"][0]
    assert first_link["title"] == "Stored news"
    assert first_link["published_display"] == "28.04.2026"
|
||||
|
||||
|
||||
def test_employee_payloads_tolerate_malformed_current_data(db_session):
|
||||
|
||||
@@ -22,6 +22,8 @@ def test_directory_template_is_russian_and_uses_display_dates():
|
||||
assert "На странице: {{ value }}" in template
|
||||
assert "{% for value in [25, 50, 100] %}" in template
|
||||
assert "Найдено:" in template
|
||||
assert "Новости" in template
|
||||
assert "employee.news_count" in template
|
||||
assert "employee.first_seen_display" in template
|
||||
assert "employee.last_seen_display" in template
|
||||
assert "employee.dismissed_display" in template
|
||||
|
||||
@@ -20,7 +20,7 @@ def test_health_returns_versions():
|
||||
response = client.get("/api/health")
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()["backend_version"] == "0.6.2"
|
||||
assert response.json()["backend_version"] == "0.7.0"
|
||||
|
||||
|
||||
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
|
||||
@@ -154,7 +154,7 @@ def test_mcp_service_info_returns_tools_and_dataset_hash():
|
||||
assert response.status_code == 200
|
||||
payload = json.loads(response.json()["result"]["content"][0]["text"])
|
||||
assert payload["service_name"] == "miem-employees"
|
||||
assert payload["backend_version"] == "0.6.2"
|
||||
assert payload["backend_version"] == "0.7.0"
|
||||
assert payload["dataset"]["hash"]
|
||||
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
|
||||
|
||||
|
||||
@@ -1,7 +1,16 @@
|
||||
import gzip
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache
|
||||
from app.models import (
|
||||
CrawlError,
|
||||
CrawlRun,
|
||||
CrawlRunEmployeeChange,
|
||||
Employee,
|
||||
EmployeeNewsLink,
|
||||
EmployeePublication,
|
||||
EmployeeSnapshot,
|
||||
ParseResourceCache,
|
||||
)
|
||||
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
|
||||
from app.services.resource_cache import ResourceCache
|
||||
|
||||
@@ -253,6 +262,44 @@ def test_upsert_employee_records_publication_errors_without_failing_employee(mon
|
||||
assert "публикации" in error.message.lower()
|
||||
|
||||
|
||||
def test_upsert_employee_saves_news_links_and_reuses_existing_rows(db_session):
    """Upserting the same parsed profile in two crawl runs leaves one news-link row."""
    source = "https://miem.hse.ru/persons"
    runs = [CrawlRun(source_url=source, status="running") for _ in range(2)]
    db_session.add_all(runs)
    db_session.commit()
    first_run, second_run = runs

    employee, _ = _upsert_employee(db_session, first_run, _parsed_employee_with_news("news-person"))
    db_session.commit()
    # Second pass with identical parsed data: the row count asserted below must stay at one.
    _upsert_employee(db_session, second_run, _parsed_employee_with_news("news-person"))
    db_session.commit()

    stored = db_session.query(EmployeeNewsLink).filter_by(employee_id=employee.id).all()
    assert len(stored) == 1
    link = stored[0]
    assert link.title == "News Title"
    assert link.url == "https://www.hse.ru/news/1.html"
    assert link.published_year == 2026
|
||||
|
||||
|
||||
def test_upsert_employee_records_news_errors_without_failing_employee(monkeypatch, db_session):
    """A failing news sync yields a ``CrawlError`` while the employee is still saved."""

    def broken_sync(*_args, **_kwargs):
        # Stand-in for the news sync helper that always fails.
        raise RuntimeError("boom")

    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(run)
    db_session.commit()

    monkeypatch.setattr("app.services.crawler._sync_employee_news_links", broken_sync)

    parsed = _parsed_employee_with_news("news-error-safe")
    employee, changed = _upsert_employee(db_session, run, parsed)
    db_session.commit()

    assert changed is True
    assert employee.full_name == "Same Person"
    assert db_session.query(Employee).filter_by(profile_key="staff:news-error-safe").one()

    # Exactly one error row is expected, and its message must mention news ("новости").
    recorded = db_session.query(CrawlError).one()
    assert "новости" in recorded.message.lower()
|
||||
|
||||
|
||||
def test_checksum_changes_when_widget_data_changes():
|
||||
base = _parsed_employee("widgets")
|
||||
changed = _parsed_employee("widgets")
|
||||
@@ -314,3 +361,23 @@ def _parsed_employee_with_publication(profile_id: str) -> dict:
|
||||
}
|
||||
]
|
||||
return parsed
|
||||
|
||||
|
||||
def _parsed_employee_with_news(profile_id: str) -> dict:
    """Return the base parsed-employee fixture with its sections replaced by one news section."""
    title = "News Title"
    url = "https://www.hse.ru/news/1.html"
    news_item = {
        "title": title,
        "url": url,
        "summary": "News summary",
        "published_at": "2026-04-28T00:00:00+00:00",
        "published_year": 2026,
        "raw_data": {"title": title, "url": url},
    }

    parsed = _parsed_employee(profile_id)
    # Overwrites whatever sections the base fixture provided.
    parsed["sections"] = [{"type": "news", "news_links": [news_item]}]
    return parsed
|
||||
|
||||
@@ -69,3 +69,47 @@ def test_runtime_schema_creates_employee_publications_table_when_employees_exist
|
||||
assert "employee_publications" in inspector.get_table_names()
|
||||
columns = {column["name"] for column in inspector.get_columns("employee_publications")}
|
||||
assert {"employee_id", "publication_id", "doi_url", "authors", "raw_data", "source_hash"}.issubset(columns)
|
||||
|
||||
|
||||
def test_runtime_schema_creates_employee_news_links_table_when_employees_exist(monkeypatch):
    """``_ensure_runtime_schema`` adds ``employee_news_links`` to a database that lacks it."""
    expected_columns = {
        "employee_id",
        "title",
        "url",
        "summary",
        "published_at",
        "published_year",
        "source_hash",
        "raw_data",
    }

    engine = create_engine("sqlite:///:memory:")
    # Pre-create only the employees and crawl_runs tables before the schema helper runs.
    with engine.begin() as connection:
        connection.execute(
            text(
                """
                CREATE TABLE employees (
                    id INTEGER PRIMARY KEY,
                    profile_key VARCHAR(255) NOT NULL UNIQUE,
                    canonical_url TEXT NOT NULL,
                    status VARCHAR(32) NOT NULL DEFAULT 'active',
                    first_seen_at DATETIME NOT NULL,
                    last_seen_at DATETIME NOT NULL,
                    created_at DATETIME NOT NULL,
                    updated_at DATETIME NOT NULL
                )
                """
            )
        )
        connection.execute(
            text(
                """
                CREATE TABLE crawl_runs (
                    id INTEGER PRIMARY KEY,
                    source_url TEXT NOT NULL,
                    status VARCHAR(32) NOT NULL DEFAULT 'running',
                    found_count INTEGER NOT NULL DEFAULT 0,
                    parsed_count INTEGER NOT NULL DEFAULT 0,
                    skipped_count INTEGER NOT NULL DEFAULT 0
                )
                """
            )
        )
    monkeypatch.setattr("app.db.engine", engine)

    # Run twice: the second call must also succeed once the table already exists.
    for _ in range(2):
        _ensure_runtime_schema()

    schema = inspect(engine)
    assert "employee_news_links" in schema.get_table_names()
    columns = {column["name"] for column in schema.get_columns("employee_news_links")}
    assert expected_columns.issubset(columns)
|
||||
|
||||
@@ -13,6 +13,9 @@ def test_employee_detail_template_is_human_readable():
|
||||
assert "section.list_items" in template
|
||||
assert "Основная информация" in template
|
||||
assert "Контакты" in template
|
||||
assert "В новостях" in template
|
||||
assert "employee_view.news_links" in template
|
||||
assert "news.summary" in template
|
||||
assert "Разделы профиля" in template
|
||||
assert "graduation_theses" in template
|
||||
assert "Год защиты" in template
|
||||
|
||||
@@ -232,3 +232,45 @@ def test_news_heading_with_publications_word_does_not_absorb_widget_publications
|
||||
assert len(publications) == 1
|
||||
assert publications[0]["title"] == "Публикации и исследования"
|
||||
assert publications[0]["publications_count"] == 1
|
||||
|
||||
|
||||
def test_extract_sections_parses_employee_news_links():
    """``extract_sections`` turns a ``press_links_news`` tab into a single news section."""
    markup = """
        <div class="b-person-data posts hidden printable" data-tab="press_links_news" tab-node="press_links_news">
          <div class="post f8">
            <div class="post__extra">
              <div class="post-meta">
                <div class="post-meta__date">
                  <div class="post-meta__day">28</div>
                  <div class="post-meta__month">апр.</div>
                  <div class="post-meta__year">2026</div>
                </div>
              </div>
            </div>
            <div class="post__content">
              <h2 class="first_child"><a class="link" href="/news/edu/1153850518.html">Как финал ВсОШ формирует кадры</a></h2>
              <div class="post__text"><p class="with-indent">Краткое описание новости.</p></div>
            </div>
          </div>
          <div class="post f8">
            <div class="post__content">
              <h2><a href="https://miem.hse.ru/news/1123589375.html">Партнер магистратуры</a></h2>
            </div>
          </div>
        </div>
        """
    soup = BeautifulSoup(markup, "html.parser")

    sections = extract_sections(soup, "https://www.hse.ru/staff/avsergeev")

    assert len(sections) == 1
    news = sections[0]
    assert news["type"] == "news"
    assert news["news_count"] == 2

    # Only the first post (relative href, full date, summary) is checked field by field.
    expected_first = {
        "title": "Как финал ВсОШ формирует кадры",
        "url": "https://www.hse.ru/news/edu/1153850518.html",
        "summary": "Краткое описание новости.",
        "published_at": "2026-04-28T00:00:00+00:00",
        "published_year": 2026,
    }
    for field, expected in expected_first.items():
        assert news["news_links"][0][field] == expected
|
||||
|
||||
Reference in New Issue
Block a user