feat: add employee news links parsing and storage

2026-05-22 18:50:25 +03:00
parent 680ac6e980
commit 4d2a071ec0
19 changed files with 636 additions and 16 deletions
--- a/app/parser/profile.py
+++ b/app/parser/profile.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import re
+from datetime import datetime, timezone
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, NavigableString, Tag
@@ -101,6 +102,8 @@ def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
 def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
    sections = []
    for h2 in soup.select("h2"):
+        if h2.find_parent(class_="post") or h2.find_parent(attrs={"data-tab": "press_links_news"}):
+            continue
        title = normalize_ws(h2.get_text(" ", strip=True))
        if not title or "расписание занятий" in title.lower():
            continue
@@ -142,6 +145,21 @@ def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
            if section_type in {"generic", "paragraphs"}:
                section["type"] = "year_blocks"
        sections.append(section)
+    news_links = _parse_news_links(soup, source_url)
+    if news_links:
+        sections.append(
+            {
+                "title": "В новостях",
+                "slug": "v_novostyah",
+                "type": "news",
+                "raw_text": "",
+                "paragraphs": [],
+                "items": [item["title"] for item in news_links if item.get("title")],
+                "links": [{"text": item["title"], "url": item["url"]} for item in news_links if item.get("title") and item.get("url")],
+                "news_count": len(news_links),
+                "news_links": news_links,
+            }
+        )
    return sections


@@ -575,6 +593,95 @@ def _parse_vkr_items(nodes: list) -> list[str]:
    return [item for item in dict.fromkeys(items) if item]


+def _parse_news_links(soup: BeautifulSoup, source_url: str) -> list[dict]:
+    """Collect the news posts listed in the ``press_links_news`` tab.
+
+    Every ``.post`` element inside ``[data-tab="press_links_news"]`` yields one
+    dict with these keys:
+
+    * ``title`` - headline link text, or the raw href when the text is empty;
+    * ``url`` - href resolved against ``source_url`` (``None`` without href);
+    * ``summary`` - text of ``.post__text`` (``None`` when absent or empty);
+    * ``published_at`` - ISO string from ``_parse_post_date`` or ``None``;
+    * ``published_year`` - year of that date, else ``.post-meta__year`` parsed
+      with ``_int_or_none``;
+    * ``raw_data`` - the unresolved values plus the ``.post-meta__date`` text.
+
+    Posts that provide neither a title nor an href are skipped. The result is
+    passed through ``_dedupe_news_links`` before being returned.
+    """
+    # select_one() with a comma-separated selector list returns the first
+    # match in document order, not in selector order. Trying the selectors
+    # one by one makes the headline link win over any earlier link in the
+    # post (thumbnail, date link, ...).
+    anchor_selectors = (".post__content h2 a[href]", "h2 a[href]", "a[href]")
+    news = []
+    for post in soup.select('[data-tab="press_links_news"] .post'):
+        if not isinstance(post, Tag):
+            continue
+        anchor = None
+        for selector in anchor_selectors:
+            anchor = post.select_one(selector)
+            if anchor is not None:
+                break
+        title = normalize_ws(anchor.get_text(" ", strip=True)) if anchor else ""
+        href = normalize_ws(anchor.get("href")) if anchor else ""
+        if not title and not href:
+            continue
+        summary_node = post.select_one(".post__text")
+        summary = normalize_ws(summary_node.get_text(" ", strip=True)) if summary_node else ""
+        published_at = _parse_post_date(post)
+        if published_at:
+            published_year = published_at.year
+        else:
+            # Keep at least the year when the full date cannot be built.
+            published_year = _int_or_none(normalize_ws(_select_text(post, ".post-meta__year")))
+        item = {
+            "title": title or href,
+            "url": urljoin(source_url, href) if href else None,
+            "summary": summary or None,
+            "published_at": published_at.isoformat() if published_at else None,
+            "published_year": published_year,
+            "raw_data": {
+                "title": title or href,
+                "url": href or None,
+                "summary": summary or None,
+                "date_text": normalize_ws(_select_text(post, ".post-meta__date")),
+            },
+        }
+        news.append(item)
+    return _dedupe_news_links(news)
+
+
+def _select_text(node: Tag, selector: str) -> str:
+    """Return the text of the first element under ``node`` matching ``selector``.
+
+    The text is taken with ``get_text(" ", strip=True)``. An empty string is
+    returned when nothing matches.
+    """
+    match = node.select_one(selector)
+    if not match:
+        return ""
+    return match.get_text(" ", strip=True)
+
+
+def _parse_post_date(post: Tag) -> datetime | None:
+    """Build a UTC ``datetime`` from the ``.post-meta__*`` fields of ``post``.
+
+    Reads ``.post-meta__day``, ``.post-meta__month`` and ``.post-meta__year``.
+    Returns ``None`` when any part is missing or falsy, or when the parts do
+    not form a valid calendar date.
+    """
+    year = _int_or_none(normalize_ws(_select_text(post, ".post-meta__year")))
+    month = _month_number(normalize_ws(_select_text(post, ".post-meta__month")))
+    day = _int_or_none(normalize_ws(_select_text(post, ".post-meta__day")))
+    if not (day and month and year):
+        return None
+    try:
+        parsed = datetime(year, month, day, tzinfo=timezone.utc)
+    except ValueError:
+        # e.g. day 31 in a 30-day month
+        return None
+    return parsed
+
+
+def _month_number(value: str) -> int | None:
+    """Map a Russian month name or abbreviation to its number (1-12).
+
+    Matching is case-insensitive and ignores surrounding whitespace and dots,
+    so ``"Сент."`` and ``"сент"`` are equivalent. Nominative and genitive full
+    names are accepted together with the common short forms, including the
+    three-letter ones for every month.
+
+    Returns ``None`` for empty or unrecognised input.
+    """
+    if not value:
+        return None
+    # Strip whitespace on both sides of the dots: " сент. " -> "сент".
+    lowered = value.strip().lower().strip(".").strip()
+    months = {
+        "янв": 1,
+        "январь": 1,
+        "января": 1,
+        "фев": 2,
+        "февр": 2,
+        "февраль": 2,
+        "февраля": 2,
+        "мар": 3,
+        "март": 3,
+        "марта": 3,
+        "апр": 4,
+        "апрель": 4,
+        "апреля": 4,
+        "май": 5,
+        "мая": 5,
+        # Three-letter forms for June, July and November were missing,
+        # unlike every other month that has one.
+        "июн": 6,
+        "июнь": 6,
+        "июня": 6,
+        "июл": 7,
+        "июль": 7,
+        "июля": 7,
+        "авг": 8,
+        "август": 8,
+        "августа": 8,
+        "сен": 9,
+        "сент": 9,
+        "сентябрь": 9,
+        "сентября": 9,
+        "окт": 10,
+        "октябрь": 10,
+        "октября": 10,
+        "ноя": 11,
+        "нояб": 11,
+        "ноябрь": 11,
+        "ноября": 11,
+        "дек": 12,
+        "декабрь": 12,
+        "декабря": 12,
+    }
+    return months.get(lowered)
+
+
 def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict:
    publication_id = str(item.get("id") or "").strip()
    title = _html_to_text(item.get("title"))
@@ -698,6 +805,17 @@ def _dedupe_publications(items: list[dict]) -> list[dict]:
    return unique


+def _dedupe_news_links(items: list[dict]) -> list[dict]:
+    """Drop repeated news entries, keeping the first occurrence of each.
+
+    An entry is identified by its ``url`` or, when that is falsy, by its
+    ``title``. Entries with neither are discarded. Input order is preserved.
+    """
+    first_seen: dict = {}
+    for entry in items:
+        identity = entry.get("url") or entry.get("title")
+        if identity:
+            # setdefault keeps the earliest entry for an already known key.
+            first_seen.setdefault(identity, entry)
+    return list(first_seen.values())
+
+
 def _html_to_text(value: object) -> str:
    return normalize_ws(BeautifulSoup(str(value or ""), "html.parser").get_text(" ", strip=True))