From 4d2a071ec088a073cd01b0bbee4a844ad9e0268a Mon Sep 17 00:00:00 2001 From: Anton Date: Fri, 22 May 2026 18:50:25 +0300 Subject: [PATCH] feat: add employee news links parsing and storage --- MCP_DESCRIPTION.md | 29 +++++- README.md | 14 ++- app/db.py | 4 + app/models.py | 27 ++++++ app/parser/profile.py | 118 +++++++++++++++++++++++++ app/services/admin_data.py | 80 ++++++++++++++++- app/services/crawler.py | 104 ++++++++++++++++++++++ app/templates/directory.html | 6 +- app/templates/employee_detail.html | 19 ++++ app/version.py | 6 +- migrations/007_employee_news_links.sql | 27 ++++++ pyproject.toml | 2 +- tests/test_admin_data.py | 52 ++++++++++- tests/test_admin_templates.py | 2 + tests/test_api_mcp.py | 4 +- tests/test_crawler.py | 69 ++++++++++++++- tests/test_db_schema.py | 44 +++++++++ tests/test_employee_detail_template.py | 3 + tests/test_parser.py | 42 +++++++++ 19 files changed, 636 insertions(+), 16 deletions(-) create mode 100644 migrations/007_employee_news_links.sql diff --git a/MCP_DESCRIPTION.md b/MCP_DESCRIPTION.md index cdaaa20..b2386b3 100644 --- a/MCP_DESCRIPTION.md +++ b/MCP_DESCRIPTION.md @@ -92,7 +92,7 @@ miem-employees "protocolVersion": "2024-11-05", "serverInfo": { "name": "miem-employees", - "version": "0.5.0" + "version": "0.7.0" }, "capabilities": { "tools": {} @@ -172,6 +172,7 @@ MCP читает данные из основной базы через SQLAlche - `employees`: текущая карточка сотрудника, статус, профиль, `current_data`, checksum. - `employee_publications`: нормализованные публикации сотрудников с авторами, DOI, аннотацией, описанием, citation text и raw JSON из HSE Publications. +- `employee_news_links`: нормализованные ссылки на новости из блока профиля «В новостях» с заголовком, URL, кратким описанием, датой, годом публикации и raw JSON карточки. - `crawl_runs`: история запусков парсинга. - `crawl_run_employee_changes`: детальные изменения сотрудников в рамках запуска. - `crawl_errors`: ошибки парсинга в рамках запуска. 
@@ -207,7 +208,29 @@ MCP читает данные из основной базы через SQLAlche } ``` -`data` соответствует распарсенному JSON профиля сотрудника. Внутри `sections` могут быть секции с публикациями, курсами, ВКР, таблицами, ссылками и произвольными текстовыми блоками. +`data` соответствует распарсенному JSON профиля сотрудника. Внутри `sections` могут быть секции с публикациями, курсами, ВКР, новостями, таблицами, ссылками и произвольными текстовыми блоками. + +Пример секции новостей внутри `data.sections`: + +```json +{ + "title": "В новостях", + "slug": "v_novostyah", + "type": "news", + "news_count": 1, + "news_links": [ + { + "title": "Название новости", + "url": "https://www.hse.ru/news/edu/1153850518.html", + "summary": "Краткое описание новости.", + "published_at": "2026-04-28T00:00:00+00:00", + "published_year": 2026 + } + ] +} +``` + +Для новостей отдельного MCP tool сейчас нет: они доступны через `get_employee(...).data.sections` или через полную синхронизацию `sync_employees(include_data=true)`. ## Tools @@ -222,7 +245,7 @@ MCP читает данные из основной базы через SQLAlche ```json { "service_name": "miem-employees", - "backend_version": "0.5.0", + "backend_version": "0.7.0", "protocolVersion": "2024-11-05", "tools": [], "dataset": { diff --git a/README.md b/README.md index 07091f6..094e265 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ - `mcp`: открытый HTTP MCP endpoint для ИИ-агентов. - `postgres`: основная БД. -Парсер использует фиксированный источник сотрудников, по умолчанию `https://miem.hse.ru/persons`. Для каждой карточки сохраняются ФИО, должности, год начала работы, контакты, идентификаторы, вкладки профиля, секции, публикации, курсы, ВКР, JSON-снапшот и сжатый HTML-снапшот. Детальные публикации дополнительно нормализуются в отдельную таблицу `employee_publications`. Ссылки обходятся только из меню профиля самого сотрудника (`person-menu`), например `#sci`, `#teaching`, `#main`. 
+Парсер использует фиксированный источник сотрудников, по умолчанию `https://miem.hse.ru/persons`. Для каждой карточки сохраняются ФИО, должности, год начала работы, контакты, идентификаторы, вкладки профиля, секции, публикации, курсы, ВКР, новости, JSON-снапшот и сжатый HTML-снапшот. Детальные публикации дополнительно нормализуются в отдельную таблицу `employee_publications`, а новости из блока «В новостях» — в `employee_news_links`. Ссылки обходятся только из меню профиля самого сотрудника (`person-menu`), например `#sci`, `#teaching`, `#main`. ## Переменные окружения @@ -73,6 +73,13 @@ docker compose up --build `list_employee_publications` сначала читает `employee_publications`; если детальных строк еще нет, возвращает старые публикации из `current_data`. +Новости сотрудников также хранятся в двух видах: + +- краткий список остается внутри `employees.current_data.sections[].news_links`; +- нормализованные карточки из вкладки «В новостях» сохраняются в `employee_news_links`. + +`employee_news_links` содержит название новости, ссылку, краткое описание, дату публикации, год публикации, raw JSON карточки и `source_hash`. Уникальность поддерживается по `(employee_id, url)` и `(employee_id, source_hash)`, поэтому повторный crawl не создает дубликаты. + ## Парсинг Weekly worker запускается по `CRAWL_CRON`. Ручной запуск доступен в админке на `Dashboard` и странице `Runs` или через REST: @@ -87,6 +94,7 @@ curl -X POST http://localhost:8000/api/crawl-runs --cookie "miem_admin_session=. 
- новые сотрудники добавляются в `employees`; - количество новых сотрудников за запуск сохраняется в `crawl_runs.new_count`; - публикации из HSE Publications записываются в `employee_publications`, а краткий список остается в JSON профиля; +- новости из блока «В новостях» записываются в `employee_news_links`, а краткий список остается в JSON профиля; - активные сотрудники, исчезнувшие из текущего списка источника, получают статус `dismissed` и `dismissed_at`; - каждый успешный новый или измененный разбор сохраняет запись в `employee_snapshots`; - неизмененные профили учитываются в `crawl_runs.skipped_count` и не получают новый snapshot. @@ -110,6 +118,8 @@ Endpoint: `POST /mcp`, без авторизации на уровне прил `get_service_info` возвращает метаданные сервиса, список tools и текущую версию набора сотрудников. `sync_employees` отдает полный snapshot или delta по `client_hash`; checksum набора строится по сотрудникам, их статусам и текущим checksums. Ответы tools возвращаются как JSON-строка внутри MCP `content[0].text`. +Новости сотрудника отдельной MCP tool не имеют: они доступны в `get_employee(...).data.sections` и `sync_employees(include_data=true)` как секция `type = "news"` с массивом `news_links`. + Пример локального запроса списка tools: ```bash @@ -129,4 +139,4 @@ docker compose exec postgres pg_dump -U miem miem_workers > backup.sql docker compose down ``` -Версия сервиса: `0.6.2`. Админка всегда показывает версии backend и frontend в footer. +Версия сервиса: `0.7.0`. Админка всегда показывает версии backend и frontend в footer. 
diff --git a/app/db.py b/app/db.py index 4d565da..095a666 100644 --- a/app/db.py +++ b/app/db.py @@ -37,6 +37,10 @@ def _ensure_runtime_schema() -> None: models.EmployeePublication.__table__.create(bind=engine, checkfirst=True) inspector = inspect(engine) table_names = set(inspector.get_table_names()) + if "employees" in table_names and "employee_news_links" not in table_names: + models.EmployeeNewsLink.__table__.create(bind=engine, checkfirst=True) + inspector = inspect(engine) + table_names = set(inspector.get_table_names()) if "crawl_runs" not in table_names: return crawl_run_columns = {column["name"] for column in inspector.get_columns("crawl_runs")} diff --git a/app/models.py b/app/models.py index 08dc5ba..1251612 100644 --- a/app/models.py +++ b/app/models.py @@ -42,6 +42,7 @@ class Employee(Base): snapshots: Mapped[list["EmployeeSnapshot"]] = relationship(back_populates="employee") tabs: Mapped[list["ProfileTab"]] = relationship(back_populates="employee", cascade="all, delete-orphan") publications: Mapped[list["EmployeePublication"]] = relationship(back_populates="employee", cascade="all, delete-orphan") + news_links: Mapped[list["EmployeeNewsLink"]] = relationship(back_populates="employee", cascade="all, delete-orphan") crawl_run_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="employee") @@ -97,6 +98,32 @@ class EmployeePublication(Base): employee: Mapped[Employee] = relationship(back_populates="publications") +class EmployeeNewsLink(Base): + __tablename__ = "employee_news_links" + __table_args__ = ( + UniqueConstraint("employee_id", "url", name="uq_employee_news_links_employee_url"), + UniqueConstraint("employee_id", "source_hash", name="uq_employee_news_links_employee_source_hash"), + Index("ix_employee_news_links_employee_id", "employee_id"), + Index("ix_employee_news_links_url", "url"), + Index("ix_employee_news_links_published_at", "published_at"), + Index("ix_employee_news_links_published_year", "published_year"), + ) + 
+ id: Mapped[int] = mapped_column(Integer, primary_key=True) + employee_id: Mapped[int] = mapped_column(ForeignKey("employees.id", ondelete="CASCADE"), nullable=False) + title: Mapped[str] = mapped_column(Text, nullable=False) + url: Mapped[str | None] = mapped_column(Text) + summary: Mapped[str | None] = mapped_column(Text) + published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + published_year: Mapped[int | None] = mapped_column(Integer) + source_hash: Mapped[str] = mapped_column(String(64), nullable=False) + raw_data: Mapped[dict | None] = mapped_column(json_type) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow, nullable=False) + + employee: Mapped[Employee] = relationship(back_populates="news_links") + + class CrawlRun(Base): __tablename__ = "crawl_runs" diff --git a/app/parser/profile.py b/app/parser/profile.py index 338cf3e..17dab43 100644 --- a/app/parser/profile.py +++ b/app/parser/profile.py @@ -1,6 +1,7 @@ import hashlib import json import re +from datetime import datetime, timezone from urllib.parse import urljoin from bs4 import BeautifulSoup, NavigableString, Tag @@ -101,6 +102,8 @@ def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict: def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]: sections = [] for h2 in soup.select("h2"): + if h2.find_parent(class_="post") or h2.find_parent(attrs={"data-tab": "press_links_news"}): + continue title = normalize_ws(h2.get_text(" ", strip=True)) if not title or "расписание занятий" in title.lower(): continue @@ -142,6 +145,21 @@ def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]: if section_type in {"generic", "paragraphs"}: section["type"] = "year_blocks" sections.append(section) + news_links = _parse_news_links(soup, source_url) + if news_links: + sections.append( + { + 
"title": "В новостях", + "slug": "v_novostyah", + "type": "news", + "raw_text": "", + "paragraphs": [], + "items": [item["title"] for item in news_links if item.get("title")], + "links": [{"text": item["title"], "url": item["url"]} for item in news_links if item.get("title") and item.get("url")], + "news_count": len(news_links), + "news_links": news_links, + } + ) return sections @@ -575,6 +593,95 @@ def _parse_vkr_items(nodes: list) -> list[str]: return [item for item in dict.fromkeys(items) if item] +def _parse_news_links(soup: BeautifulSoup, source_url: str) -> list[dict]: + news = [] + for post in soup.select('[data-tab="press_links_news"] .post'): + if not isinstance(post, Tag): + continue + anchor = post.select_one(".post__content h2 a[href], h2 a[href], a[href]") + title = normalize_ws(anchor.get_text(" ", strip=True)) if anchor else "" + href = normalize_ws(anchor.get("href")) if anchor else "" + summary_node = post.select_one(".post__text") + summary = normalize_ws(summary_node.get_text(" ", strip=True)) if summary_node else "" + published_at = _parse_post_date(post) + if not title and not href: + continue + item = { + "title": title or href, + "url": urljoin(source_url, href) if href else None, + "summary": summary or None, + "published_at": published_at.isoformat() if published_at else None, + "published_year": published_at.year if published_at else _int_or_none(normalize_ws(_select_text(post, ".post-meta__year"))), + "raw_data": { + "title": title or href, + "url": href or None, + "summary": summary or None, + "date_text": normalize_ws(_select_text(post, ".post-meta__date")), + }, + } + news.append(item) + return _dedupe_news_links(news) + + +def _select_text(node: Tag, selector: str) -> str: + selected = node.select_one(selector) + return selected.get_text(" ", strip=True) if selected else "" + + +def _parse_post_date(post: Tag) -> datetime | None: + day = _int_or_none(normalize_ws(_select_text(post, ".post-meta__day"))) + month = 
_month_number(normalize_ws(_select_text(post, ".post-meta__month"))) + year = _int_or_none(normalize_ws(_select_text(post, ".post-meta__year"))) + if not day or not month or not year: + return None + try: + return datetime(year, month, day, tzinfo=timezone.utc) + except ValueError: + return None + + +def _month_number(value: str) -> int | None: + lowered = value.lower().strip(".") + months = {  # full names, genitive forms and short abbreviations (with the trailing dot already stripped) + "янв": 1, + "январь": 1, + "января": 1, + "фев": 2, + "февр": 2, + "февраль": 2, + "февраля": 2, + "март": 3, + "мар": 3, + "марта": 3, + "апр": 4, + "апрель": 4, + "апреля": 4, + "май": 5, + "мая": 5, + "июн": 6, "июнь": 6, + "июня": 6, + "июл": 7, "июль": 7, + "июля": 7, + "авг": 8, + "август": 8, + "августа": 8, + "сент": 9, + "сен": 9, + "сентябрь": 9, + "сентября": 9, + "окт": 10, + "октябрь": 10, + "октября": 10, + "ноя": 11, "нояб": 11, + "ноябрь": 11, + "ноября": 11, + "дек": 12, + "декабрь": 12, + "декабря": 12, + } + return months.get(lowered) + + def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict: publication_id = str(item.get("id") or "").strip() title = _html_to_text(item.get("title")) @@ -698,6 +805,17 @@ def _dedupe_publications(items: list[dict]) -> list[dict]: return unique +def _dedupe_news_links(items: list[dict]) -> list[dict]: + seen = set() + unique = [] + for item in items: + key = item.get("url") or item.get("title") + if key and key not in seen: + seen.add(key) + unique.append(item) + return unique + + def _html_to_text(value: object) -> str: return normalize_ws(BeautifulSoup(str(value or ""), "html.parser").get_text(" ", strip=True)) diff --git a/app/services/admin_data.py b/app/services/admin_data.py index 5e30f3b..0149117 100644 --- a/app/services/admin_data.py +++ b/app/services/admin_data.py @@ -8,7 +8,7 @@ from zoneinfo import ZoneInfo from sqlalchemy import Select, Text, and_, desc, func, or_, select from sqlalchemy.orm import Session -from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee +from app.models 
import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink EMPLOYEE_SORTS = { "full_name": Employee.full_name, @@ -24,6 +24,7 @@ def employee_display_payload(employee: Employee) -> dict[str, Any]: data = _as_dict(employee.current_data) contacts = _as_dict(data.get("contacts")) sections = _as_list(data.get("sections")) + stored_news_links = _stored_news_links(employee) positions = _clean_list(data.get("positions")) emails = _clean_list(contacts.get("emails")) phones = _clean_list(contacts.get("phones")) @@ -43,6 +44,7 @@ def employee_display_payload(employee: Employee) -> dict[str, Any]: "address": contacts.get("address"), "publications_count": _count_section_items(sections, "publications"), "courses_count": _count_section_items(sections, "courses_by_year"), + "news_count": len(stored_news_links) or _count_section_items(sections, "news"), "first_seen_at": employee.first_seen_at.isoformat() if employee.first_seen_at else None, "last_seen_at": employee.last_seen_at.isoformat() if employee.last_seen_at else None, "dismissed_at": employee.dismissed_at.isoformat() if employee.dismissed_at else None, @@ -67,6 +69,7 @@ def employee_detail_payload(employee: Employee) -> dict[str, Any]: "contact_items": _normalize_contact_items(contacts.get("items")), }, "external_ids": _normalize_external_ids(data.get("external_ids")), + "news_links": _detail_news_links(employee, data), "sections": [_normalize_section(section) for section in _as_list(data.get("sections"))], } @@ -276,6 +279,8 @@ def _count_section_items(sections: list[dict[str, Any]], section_type: str) -> i total += len(section.get("publications") or section.get("items") or []) elif section_type == "courses_by_year": total += len(section.get("courses") or []) + elif section_type == "news": + total += len(section.get("news_links") or section.get("items") or []) return total @@ -348,6 +353,8 @@ def _normalize_section(section: Any) -> dict[str, Any]: "year_entries": 
_normalize_year_entries(section.get("year_entries")), "publications": _normalize_publications(section.get("publications")), "publications_count": section.get("publications_count"), + "news_links": _normalize_news_links(section.get("news_links")), + "news_count": section.get("news_count"), "theses": _normalize_theses(section.get("theses")), "theses_count": section.get("theses_count"), "academic_year": section.get("academic_year"), @@ -370,6 +377,77 @@ def _normalize_links(items: Any) -> list[dict[str, str | None]]: return normalized +def _stored_news_links(employee: Employee) -> list[dict[str, Any]]: + return [_stored_news_link_payload(item) for item in sorted(employee.news_links, key=_news_link_sort_key)] + + +def _news_link_sort_key(item: EmployeeNewsLink) -> tuple: + timestamp = item.published_at.timestamp() if item.published_at else 0 + return (-timestamp, item.title or "", item.id) + + +def _stored_news_link_payload(item: EmployeeNewsLink) -> dict[str, Any]: + return { + "title": item.title, + "url": item.url, + "summary": item.summary, + "published_at": item.published_at.isoformat() if item.published_at else None, + "published_year": item.published_year, + "published_display": format_admin_date(item.published_at) if item.published_at else str(item.published_year or ""), + } + + +def _detail_news_links(employee: Employee, data: dict[str, Any]) -> list[dict[str, Any]]: + stored = _stored_news_links(employee) + if stored: + return stored + for section in _as_list(data.get("sections")): + if isinstance(section, dict) and section.get("type") == "news": + return _normalize_news_links(section.get("news_links")) + return [] + + +def format_admin_date(value: Any) -> str: + if not value: + return "" + if isinstance(value, str): + try: + value = datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError: + return value + if not isinstance(value, datetime): + return str(value) + if value.tzinfo: + value = value.astimezone(ZoneInfo("Europe/Moscow")) + return 
value.strftime("%d.%m.%Y") + + +def _normalize_news_links(items: Any) -> list[dict[str, Any]]: + normalized = [] + if not isinstance(items, list): + return normalized + for item in items: + if not isinstance(item, dict): + continue + title = str(item.get("title") or item.get("url") or "").strip() + url = str(item.get("url") or "").strip() + summary = str(item.get("summary") or "").strip() + published_at = str(item.get("published_at") or "").strip() + published_year = item.get("published_year") + if title or url: + normalized.append( + { + "title": title or url, + "url": url or None, + "summary": summary or None, + "published_at": published_at or None, + "published_year": published_year, + "published_display": format_admin_date(published_at) if published_at else str(published_year or ""), + } + ) + return normalized + + def _normalize_year_entries(items: Any) -> list[dict[str, Any]]: normalized = [] if not isinstance(items, list): diff --git a/app/services/crawler.py b/app/services/crawler.py index 2ce95c0..1a11ea7 100644 --- a/app/services/crawler.py +++ b/app/services/crawler.py @@ -15,6 +15,7 @@ from app.models import ( CrawlRun, CrawlRunEmployeeChange, Employee, + EmployeeNewsLink, EmployeePublication, EmployeeSnapshot, ParserSource, @@ -230,6 +231,7 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> tuple[Employee ) db.flush() _try_sync_employee_publications(db, run, employee, parsed) + _try_sync_employee_news_links(db, run, employee, parsed) return employee, changed @@ -349,6 +351,108 @@ def _int_or_none(value: object) -> int | None: return None +def _try_sync_employee_news_links(db: Session, run: CrawlRun, employee: Employee, parsed: dict) -> None: + try: + if not _news_link_payloads(parsed): + return + if not _employee_news_links_table_exists(db): + return + with db.begin_nested(): + _sync_employee_news_links(db, employee, parsed) + except Exception as exc: + db.add( + CrawlError( + crawl_run_id=run.id, + profile_url=employee.canonical_url, 
+ error_type=type(exc).__name__, + message=f"Не удалось сохранить новости сотрудника: {exc}", + ) + ) + + +def _employee_news_links_table_exists(db: Session) -> bool: + return inspect(db.connection()).has_table(EmployeeNewsLink.__tablename__) + + +def _sync_employee_news_links(db: Session, employee: Employee, parsed: dict) -> None: + news_links = _news_link_payloads(parsed) + seen_hashes = set() + for news_link in news_links: + source_hash = _news_link_hash(news_link) + seen_hashes.add(source_hash) + url = _clean_optional(news_link.get("url")) + existing = None + if url: + existing = db.scalar( + select(EmployeeNewsLink).where( + EmployeeNewsLink.employee_id == employee.id, + EmployeeNewsLink.url == url, + ) + ) + if not existing: + existing = db.scalar( + select(EmployeeNewsLink).where( + EmployeeNewsLink.employee_id == employee.id, + EmployeeNewsLink.source_hash == source_hash, + ) + ) + if not existing: + existing = EmployeeNewsLink(employee_id=employee.id, source_hash=source_hash, title=_news_link_title(news_link)) + db.add(existing) + _apply_news_link(existing, news_link, source_hash) + + if seen_hashes: + stale = db.scalars( + select(EmployeeNewsLink).where( + EmployeeNewsLink.employee_id == employee.id, + EmployeeNewsLink.source_hash.not_in(seen_hashes), + ) + ).all() + for item in stale: + db.delete(item) + + +def _news_link_payloads(parsed: dict) -> list[dict]: + news_links = [] + for section in parsed.get("sections") or []: + if not isinstance(section, dict) or section.get("type") != "news": + continue + for item in section.get("news_links") or []: + if isinstance(item, dict): + news_links.append(item) + return news_links + + +def _apply_news_link(target: EmployeeNewsLink, news_link: dict, source_hash: str) -> None: + target.title = _news_link_title(news_link) + target.url = _clean_optional(news_link.get("url")) + target.summary = _clean_optional(news_link.get("summary")) + target.published_at = _datetime_or_none(news_link.get("published_at")) + 
target.published_year = _int_or_none(news_link.get("published_year")) + target.raw_data = news_link.get("raw_data") if isinstance(news_link.get("raw_data"), dict) else news_link + target.source_hash = source_hash + + +def _news_link_hash(news_link: dict) -> str: + return _payload_hash(news_link.get("raw_data") if isinstance(news_link.get("raw_data"), dict) else news_link) + + +def _news_link_title(news_link: dict) -> str: + return _clean_optional(news_link.get("title") or news_link.get("url")) or "Untitled news" + + +def _datetime_or_none(value: object) -> datetime | None: + if isinstance(value, datetime): + return value + if not value: + return None + try: + parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00")) + except ValueError: + return None + return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc) + + def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int: dismissed = 0 active = db.scalars(select(Employee).where(Employee.status == "active")).all() diff --git a/app/templates/directory.html b/app/templates/directory.html index c0cd64e..2b4cd51 100644 --- a/app/templates/directory.html +++ b/app/templates/directory.html @@ -55,6 +55,7 @@ Адрес Публикации Курсы + Новости Впервые найден Последний раз найден Дата увольнения @@ -73,13 +74,14 @@ {{ employee.address or "" }} {{ employee.publications_count }} {{ employee.courses_count }} + {{ employee.news_count }} {{ employee.first_seen_display }} {{ employee.last_seen_display }} {{ employee.dismissed_display }} Открыть {% else %} - По этим фильтрам сотрудники не найдены. + По этим фильтрам сотрудники не найдены. {% endfor %} @@ -106,7 +108,7 @@
- {% for key, label in [("full_name", "ФИО"), ("status", "Статус"), ("positions", "Должности"), ("hse_start_year", "Год начала"), ("email", "Email"), ("phone", "Телефон"), ("address", "Адрес"), ("publications_count", "Публикации"), ("courses_count", "Курсы"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения"), ("profile", "Профиль")] %} + {% for key, label in [("full_name", "ФИО"), ("status", "Статус"), ("positions", "Должности"), ("hse_start_year", "Год начала"), ("email", "Email"), ("phone", "Телефон"), ("address", "Адрес"), ("publications_count", "Публикации"), ("courses_count", "Курсы"), ("news_count", "Новости"), ("first_seen_at", "Впервые найден"), ("last_seen_at", "Последний раз найден"), ("dismissed_at", "Дата увольнения"), ("profile", "Профиль")] %} {% endfor %}
diff --git a/app/templates/employee_detail.html b/app/templates/employee_detail.html index 9809566..591f29c 100644 --- a/app/templates/employee_detail.html +++ b/app/templates/employee_detail.html @@ -104,6 +104,25 @@ {% endif %} + {% if employee_view.news_links %} +
+

В новостях

+ +
+ {% endif %} +

Разделы профиля

{% if employee_view.sections %} diff --git a/app/version.py b/app/version.py index 66c30a3..8937d1a 100644 --- a/app/version.py +++ b/app/version.py @@ -1,3 +1,3 @@ -APP_VERSION = "0.6.2" -FRONTEND_VERSION = "0.6.2" -BACKEND_VERSION = "0.6.2" +APP_VERSION = "0.7.0" +FRONTEND_VERSION = "0.7.0" +BACKEND_VERSION = "0.7.0" diff --git a/migrations/007_employee_news_links.sql b/migrations/007_employee_news_links.sql new file mode 100644 index 0000000..4935cf1 --- /dev/null +++ b/migrations/007_employee_news_links.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS employee_news_links ( + id SERIAL PRIMARY KEY, + employee_id INTEGER NOT NULL REFERENCES employees(id) ON DELETE CASCADE, + title TEXT NOT NULL, + url TEXT, + summary TEXT, + published_at TIMESTAMPTZ, + published_year INTEGER, + source_hash VARCHAR(64) NOT NULL, + raw_data JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + CONSTRAINT uq_employee_news_links_employee_url UNIQUE (employee_id, url), + CONSTRAINT uq_employee_news_links_employee_source_hash UNIQUE (employee_id, source_hash) +); + +CREATE INDEX IF NOT EXISTS ix_employee_news_links_employee_id + ON employee_news_links (employee_id); + +CREATE INDEX IF NOT EXISTS ix_employee_news_links_url + ON employee_news_links (url); + +CREATE INDEX IF NOT EXISTS ix_employee_news_links_published_at + ON employee_news_links (published_at); + +CREATE INDEX IF NOT EXISTS ix_employee_news_links_published_year + ON employee_news_links (published_year); diff --git a/pyproject.toml b/pyproject.toml index 93967f1..a6e5830 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "miem-workers" -version = "0.6.2" +version = "0.7.0" description = "MIEM employees parser, admin API, and MCP server" requires-python = ">=3.11" dependencies = [ diff --git a/tests/test_admin_data.py b/tests/test_admin_data.py index 3206542..9bf2018 100644 --- a/tests/test_admin_data.py +++ b/tests/test_admin_data.py @@ 
-1,6 +1,6 @@ from datetime import datetime, timezone -from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee +from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeNewsLink from app.services.admin_data import ( employee_detail_payload, employee_display_payload, @@ -35,6 +35,7 @@ def test_employee_display_payload_extracts_common_fields(db_session): "sections": [ {"type": "publications", "publications": [{"title": "Paper"}]}, {"type": "courses_by_year", "courses": [{"title": "Course"}]}, + {"type": "news", "news_links": [{"title": "News", "url": "https://example.test/news"}]}, ], }, ) @@ -46,6 +47,7 @@ def test_employee_display_payload_extracts_common_fields(db_session): assert payload["email_text"] == "person@hse.ru" assert payload["publications_count"] == 1 assert payload["courses_count"] == 1 + assert payload["news_count"] == 1 assert payload["first_seen_display"] != "Не указано" @@ -104,6 +106,19 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session): "type": "generic", "raw_text": "Fallback text", }, + { + "title": "В новостях", + "type": "news", + "news_links": [ + { + "title": "News title", + "url": "https://example.test/news", + "summary": "News summary", + "published_at": "2026-04-28T00:00:00+00:00", + "published_year": 2026, + } + ], + }, ], }, ) @@ -118,6 +133,41 @@ def test_employee_detail_payload_normalizes_human_readable_sections(db_session): assert payload["sections"][2]["courses"][0]["title"] == "Course" assert payload["sections"][3]["theses"][0]["student"] == "Student Name" assert payload["sections"][4]["paragraphs"] == ["Fallback text"] + assert payload["sections"][5]["news_links"][0]["title"] == "News title" + assert payload["news_links"][0]["published_display"] == "28.04.2026" + + +def test_employee_payload_prefers_stored_news_links(db_session): + employee = Employee( + profile_key="staff:news", + canonical_url="https://www.hse.ru/staff/news", + full_name="News 
Person", + status="active", + first_seen_at=datetime.now(timezone.utc), + last_seen_at=datetime.now(timezone.utc), + current_data={"sections": [{"type": "news", "news_links": [{"title": "Old news"}]}]}, + ) + db_session.add(employee) + db_session.commit() + db_session.add( + EmployeeNewsLink( + employee_id=employee.id, + title="Stored news", + url="https://example.test/stored", + summary="Stored summary", + published_at=datetime(2026, 4, 28, tzinfo=timezone.utc), + published_year=2026, + source_hash="b" * 64, + ) + ) + db_session.commit() + + display = employee_display_payload(employee) + detail = employee_detail_payload(employee) + + assert display["news_count"] == 1 + assert detail["news_links"][0]["title"] == "Stored news" + assert detail["news_links"][0]["published_display"] == "28.04.2026" def test_employee_payloads_tolerate_malformed_current_data(db_session): diff --git a/tests/test_admin_templates.py b/tests/test_admin_templates.py index fe50d5a..d0b0679 100644 --- a/tests/test_admin_templates.py +++ b/tests/test_admin_templates.py @@ -22,6 +22,8 @@ def test_directory_template_is_russian_and_uses_display_dates(): assert "На странице: {{ value }}" in template assert "{% for value in [25, 50, 100] %}" in template assert "Найдено:" in template + assert "Новости" in template + assert "employee.news_count" in template assert "employee.first_seen_display" in template assert "employee.last_seen_display" in template assert "employee.dismissed_display" in template diff --git a/tests/test_api_mcp.py b/tests/test_api_mcp.py index ca43e22..d2e25f9 100644 --- a/tests/test_api_mcp.py +++ b/tests/test_api_mcp.py @@ -20,7 +20,7 @@ def test_health_returns_versions(): response = client.get("/api/health") assert response.status_code == 200 - assert response.json()["backend_version"] == "0.6.2" + assert response.json()["backend_version"] == "0.7.0" def test_mcp_lists_tools_without_auth_and_ignores_auth_header(): @@ -154,7 +154,7 @@ def 
test_mcp_service_info_returns_tools_and_dataset_hash(): assert response.status_code == 200 payload = json.loads(response.json()["result"]["content"][0]["text"]) assert payload["service_name"] == "miem-employees" - assert payload["backend_version"] == "0.6.2" + assert payload["backend_version"] == "0.7.0" assert payload["dataset"]["hash"] assert any(tool["name"] == "sync_employees" for tool in payload["tools"]) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 430606a..d40da22 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,7 +1,16 @@ import gzip from datetime import datetime, timezone -from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeePublication, EmployeeSnapshot, ParseResourceCache +from app.models import ( + CrawlError, + CrawlRun, + CrawlRunEmployeeChange, + Employee, + EmployeeNewsLink, + EmployeePublication, + EmployeeSnapshot, + ParseResourceCache, +) from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee from app.services.resource_cache import ResourceCache @@ -253,6 +262,44 @@ def test_upsert_employee_records_publication_errors_without_failing_employee(mon assert "публикации" in error.message.lower() +def test_upsert_employee_saves_news_links_and_reuses_existing_rows(db_session): + first_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") + second_run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") + db_session.add_all([first_run, second_run]) + db_session.commit() + + employee, _ = _upsert_employee(db_session, first_run, _parsed_employee_with_news("news-person")) + db_session.commit() + _upsert_employee(db_session, second_run, _parsed_employee_with_news("news-person")) + db_session.commit() + + news_links = db_session.query(EmployeeNewsLink).filter_by(employee_id=employee.id).all() + assert len(news_links) == 1 + assert news_links[0].title == "News Title" + assert news_links[0].url == 
"https://www.hse.ru/news/1.html" + assert news_links[0].published_year == 2026 + + +def test_upsert_employee_records_news_errors_without_failing_employee(monkeypatch, db_session): + run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running") + db_session.add(run) + db_session.commit() + + def broken_sync(*_args, **_kwargs): + raise RuntimeError("boom") + + monkeypatch.setattr("app.services.crawler._sync_employee_news_links", broken_sync) + + employee, changed = _upsert_employee(db_session, run, _parsed_employee_with_news("news-error-safe")) + db_session.commit() + + assert changed is True + assert employee.full_name == "Same Person" + assert db_session.query(Employee).filter_by(profile_key="staff:news-error-safe").one() + error = db_session.query(CrawlError).one() + assert "новости" in error.message.lower() + + def test_checksum_changes_when_widget_data_changes(): base = _parsed_employee("widgets") changed = _parsed_employee("widgets") @@ -314,3 +361,23 @@ def _parsed_employee_with_publication(profile_id: str) -> dict: } ] return parsed + + +def _parsed_employee_with_news(profile_id: str) -> dict: + parsed = _parsed_employee(profile_id) + parsed["sections"] = [ + { + "type": "news", + "news_links": [ + { + "title": "News Title", + "url": "https://www.hse.ru/news/1.html", + "summary": "News summary", + "published_at": "2026-04-28T00:00:00+00:00", + "published_year": 2026, + "raw_data": {"title": "News Title", "url": "https://www.hse.ru/news/1.html"}, + } + ], + } + ] + return parsed diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py index a250b25..2fef31e 100644 --- a/tests/test_db_schema.py +++ b/tests/test_db_schema.py @@ -69,3 +69,47 @@ def test_runtime_schema_creates_employee_publications_table_when_employees_exist assert "employee_publications" in inspector.get_table_names() columns = {column["name"] for column in inspector.get_columns("employee_publications")} assert {"employee_id", "publication_id", "doi_url", "authors", "raw_data", 
"source_hash"}.issubset(columns) + + +def test_runtime_schema_creates_employee_news_links_table_when_employees_exist(monkeypatch): + engine = create_engine("sqlite:///:memory:") + with engine.begin() as connection: + connection.execute( + text( + """ + CREATE TABLE employees ( + id INTEGER PRIMARY KEY, + profile_key VARCHAR(255) NOT NULL UNIQUE, + canonical_url TEXT NOT NULL, + status VARCHAR(32) NOT NULL DEFAULT 'active', + first_seen_at DATETIME NOT NULL, + last_seen_at DATETIME NOT NULL, + created_at DATETIME NOT NULL, + updated_at DATETIME NOT NULL + ) + """ + ) + ) + connection.execute( + text( + """ + CREATE TABLE crawl_runs ( + id INTEGER PRIMARY KEY, + source_url TEXT NOT NULL, + status VARCHAR(32) NOT NULL DEFAULT 'running', + found_count INTEGER NOT NULL DEFAULT 0, + parsed_count INTEGER NOT NULL DEFAULT 0, + skipped_count INTEGER NOT NULL DEFAULT 0 + ) + """ + ) + ) + monkeypatch.setattr("app.db.engine", engine) + + _ensure_runtime_schema() + _ensure_runtime_schema() + + inspector = inspect(engine) + assert "employee_news_links" in inspector.get_table_names() + columns = {column["name"] for column in inspector.get_columns("employee_news_links")} + assert {"employee_id", "title", "url", "summary", "published_at", "published_year", "source_hash", "raw_data"}.issubset(columns) diff --git a/tests/test_employee_detail_template.py b/tests/test_employee_detail_template.py index 840fd32..1c69087 100644 --- a/tests/test_employee_detail_template.py +++ b/tests/test_employee_detail_template.py @@ -13,6 +13,9 @@ def test_employee_detail_template_is_human_readable(): assert "section.list_items" in template assert "Основная информация" in template assert "Контакты" in template + assert "В новостях" in template + assert "employee_view.news_links" in template + assert "news.summary" in template assert "Разделы профиля" in template assert "graduation_theses" in template assert "Год защиты" in template diff --git a/tests/test_parser.py b/tests/test_parser.py index 
1120d05..ec7290e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -232,3 +232,45 @@ def test_news_heading_with_publications_word_does_not_absorb_widget_publications assert len(publications) == 1 assert publications[0]["title"] == "Публикации и исследования" assert publications[0]["publications_count"] == 1 + + +def test_extract_sections_parses_employee_news_links(): + soup = BeautifulSoup( + """ + + """, + "html.parser", + ) + + sections = extract_sections(soup, "https://www.hse.ru/staff/avsergeev") + + assert len(sections) == 1 + news = sections[0] + assert news["type"] == "news" + assert news["news_count"] == 2 + assert news["news_links"][0]["title"] == "Как финал ВсОШ формирует кадры" + assert news["news_links"][0]["url"] == "https://www.hse.ru/news/edu/1153850518.html" + assert news["news_links"][0]["summary"] == "Краткое описание новости." + assert news["news_links"][0]["published_at"] == "2026-04-28T00:00:00+00:00" + assert news["news_links"][0]["published_year"] == 2026