feat: add employee news links parsing and storage

This commit is contained in:
Anton
2026-05-22 18:50:25 +03:00
parent 680ac6e980
commit 4d2a071ec0
19 changed files with 636 additions and 16 deletions

View File

@@ -1,6 +1,7 @@
import hashlib
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString, Tag
@@ -101,6 +102,8 @@ def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
sections = []
for h2 in soup.select("h2"):
if h2.find_parent(class_="post") or h2.find_parent(attrs={"data-tab": "press_links_news"}):
continue
title = normalize_ws(h2.get_text(" ", strip=True))
if not title or "расписание занятий" in title.lower():
continue
@@ -142,6 +145,21 @@ def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
if section_type in {"generic", "paragraphs"}:
section["type"] = "year_blocks"
sections.append(section)
news_links = _parse_news_links(soup, source_url)
if news_links:
sections.append(
{
"title": "В новостях",
"slug": "v_novostyah",
"type": "news",
"raw_text": "",
"paragraphs": [],
"items": [item["title"] for item in news_links if item.get("title")],
"links": [{"text": item["title"], "url": item["url"]} for item in news_links if item.get("title") and item.get("url")],
"news_count": len(news_links),
"news_links": news_links,
}
)
return sections
@@ -575,6 +593,95 @@ def _parse_vkr_items(nodes: list) -> list[str]:
return [item for item in dict.fromkeys(items) if item]
def _parse_news_links(soup: BeautifulSoup, source_url: str) -> list[dict]:
    """Extract the news entries listed under the ``press_links_news`` tab.

    Every ``.post`` element inside ``[data-tab="press_links_news"]`` yields one
    dict with these keys:

    * ``title`` -- text of the headline link, or the raw href when that text
      is empty;
    * ``url`` -- the href resolved against ``source_url``, or ``None``;
    * ``summary`` -- text of ``.post__text``, or ``None``;
    * ``published_at`` -- ISO 8601 string of the date returned by
      ``_parse_post_date``, or ``None``;
    * ``published_year`` -- year of that date, otherwise the
      ``.post-meta__year`` text passed through ``_int_or_none``;
    * ``raw_data`` -- title, unresolved href, summary and the
      ``.post-meta__date`` text as found on the page.

    Posts that yield neither a title nor an href are skipped, and the final
    list is passed through ``_dedupe_news_links``.
    """
    # Tried one at a time, most specific first.  A single comma-separated
    # selector would return whichever matching element comes first in the
    # document (e.g. a thumbnail link placed before the headline) rather than
    # the preferred match.
    anchor_selectors = (".post__content h2 a[href]", "h2 a[href]", "a[href]")

    news = []
    for post in soup.select('[data-tab="press_links_news"] .post'):
        if not isinstance(post, Tag):
            continue

        anchor = None
        for selector in anchor_selectors:
            anchor = post.select_one(selector)
            if anchor:
                break

        title = normalize_ws(anchor.get_text(" ", strip=True)) if anchor else ""
        href = normalize_ws(anchor.get("href")) if anchor else ""
        if not title and not href:
            # Nothing to identify the post by; skip before doing more work.
            continue

        summary_node = post.select_one(".post__text")
        summary = normalize_ws(summary_node.get_text(" ", strip=True)) if summary_node else ""

        published_at = _parse_post_date(post)
        if published_at:
            published_year = published_at.year
        else:
            # No full date available: fall back to the year field alone.
            published_year = _int_or_none(normalize_ws(_select_text(post, ".post-meta__year")))

        news.append(
            {
                "title": title or href,
                "url": urljoin(source_url, href) if href else None,
                "summary": summary or None,
                "published_at": published_at.isoformat() if published_at else None,
                "published_year": published_year,
                "raw_data": {
                    "title": title or href,
                    "url": href or None,
                    "summary": summary or None,
                    "date_text": normalize_ws(_select_text(post, ".post-meta__date")),
                },
            }
        )
    return _dedupe_news_links(news)
def _select_text(node: Tag, selector: str) -> str:
    """Return the text of the first element under ``node`` matching ``selector``.

    The text is collected with ``get_text(" ", strip=True)``.  An empty string
    is returned when nothing matches.
    """
    match = node.select_one(selector)
    if not match:
        return ""
    return match.get_text(" ", strip=True)
def _parse_post_date(post: Tag) -> datetime | None:
    """Build a UTC datetime from a post's ``.post-meta__*`` date fields.

    The day and year texts go through ``_int_or_none`` and the month text
    through ``_month_number``.  Returns ``None`` when any of the three comes
    back falsy, or when ``datetime`` rejects the combination with
    ``ValueError``.
    """
    day_text = normalize_ws(_select_text(post, ".post-meta__day"))
    month_text = normalize_ws(_select_text(post, ".post-meta__month"))
    year_text = normalize_ws(_select_text(post, ".post-meta__year"))

    day = _int_or_none(day_text)
    month = _month_number(month_text)
    year = _int_or_none(year_text)
    if not (day and month and year):
        return None

    try:
        parsed = datetime(year, month, day, tzinfo=timezone.utc)
    except ValueError:
        # e.g. a day number that does not exist in the given month
        return None
    return parsed
def _month_number(value: str) -> int | None:
lowered = value.lower().strip(".")
months = {
"янв": 1,
"январь": 1,
"января": 1,
"фев": 2,
"февр": 2,
"февраль": 2,
"февраля": 2,
"март": 3,
"мар": 3,
"марта": 3,
"апр": 4,
"апрель": 4,
"апреля": 4,
"май": 5,
"мая": 5,
"июнь": 6,
"июня": 6,
"июль": 7,
"июля": 7,
"авг": 8,
"август": 8,
"августа": 8,
"сент": 9,
"сен": 9,
"сентябрь": 9,
"сентября": 9,
"окт": 10,
"октябрь": 10,
"октября": 10,
"нояб": 11,
"ноябрь": 11,
"ноября": 11,
"дек": 12,
"декабрь": 12,
"декабря": 12,
}
return months.get(lowered)
def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict:
publication_id = str(item.get("id") or "").strip()
title = _html_to_text(item.get("title"))
@@ -698,6 +805,17 @@ def _dedupe_publications(items: list[dict]) -> list[dict]:
return unique
def _dedupe_news_links(items: list[dict]) -> list[dict]:
    """Drop repeated news entries, keeping the first occurrence of each.

    An entry is identified by its ``url``, falling back to its ``title`` when
    the URL is missing or empty.  Entries with neither are discarded.  The
    order of the surviving entries follows their first appearance in ``items``.
    """
    first_by_identity: dict = {}
    for entry in items:
        identity = entry.get("url") or entry.get("title")
        if identity:
            # setdefault keeps the entry already stored for this identity.
            first_by_identity.setdefault(identity, entry)
    return list(first_by_identity.values())
def _html_to_text(value: object) -> str:
    """Parse ``value`` as HTML and return its text passed through ``normalize_ws``.

    Falsy values (``None``, ``""``, ...) are treated as an empty document;
    anything else is converted with ``str`` before parsing.
    """
    markup = str(value or "")
    document = BeautifulSoup(markup, "html.parser")
    plain_text = document.get_text(" ", strip=True)
    return normalize_ws(plain_text)