feat: add employee news links parsing and storage
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
@@ -101,6 +102,8 @@ def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
|
||||
def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
|
||||
sections = []
|
||||
for h2 in soup.select("h2"):
|
||||
if h2.find_parent(class_="post") or h2.find_parent(attrs={"data-tab": "press_links_news"}):
|
||||
continue
|
||||
title = normalize_ws(h2.get_text(" ", strip=True))
|
||||
if not title or "расписание занятий" in title.lower():
|
||||
continue
|
||||
@@ -142,6 +145,21 @@ def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
|
||||
if section_type in {"generic", "paragraphs"}:
|
||||
section["type"] = "year_blocks"
|
||||
sections.append(section)
|
||||
news_links = _parse_news_links(soup, source_url)
|
||||
if news_links:
|
||||
sections.append(
|
||||
{
|
||||
"title": "В новостях",
|
||||
"slug": "v_novostyah",
|
||||
"type": "news",
|
||||
"raw_text": "",
|
||||
"paragraphs": [],
|
||||
"items": [item["title"] for item in news_links if item.get("title")],
|
||||
"links": [{"text": item["title"], "url": item["url"]} for item in news_links if item.get("title") and item.get("url")],
|
||||
"news_count": len(news_links),
|
||||
"news_links": news_links,
|
||||
}
|
||||
)
|
||||
return sections
|
||||
|
||||
|
||||
@@ -575,6 +593,95 @@ def _parse_vkr_items(nodes: list) -> list[str]:
|
||||
return [item for item in dict.fromkeys(items) if item]
|
||||
|
||||
|
||||
def _parse_news_links(soup: BeautifulSoup, source_url: str) -> list[dict]:
    """Collect news entries from the ``press_links_news`` tab of a page.

    Every ``.post`` element inside the ``[data-tab="press_links_news"]``
    container is turned into a dict with the keys ``title``, ``url``
    (made absolute against *source_url*), ``summary``, ``published_at``
    (ISO string or ``None``), ``published_year`` and ``raw_data`` (the
    values as scraped, before URL resolution).

    Posts that yield neither a title nor a link are skipped. The result is
    passed through ``_dedupe_news_links`` before being returned.
    """
    # Ordered from most to least specific. The selectors are tried one at a
    # time on purpose: a comma-separated selector list matches in document
    # order, so the bare ``a[href]`` alternative would let any link that
    # precedes the headline inside the post win over the headline link.
    anchor_selectors = (
        ".post__content h2 a[href]",
        "h2 a[href]",
        "a[href]",
    )

    news = []
    for post in soup.select('[data-tab="press_links_news"] .post'):
        if not isinstance(post, Tag):
            continue

        anchor = None
        for selector in anchor_selectors:
            anchor = post.select_one(selector)
            if anchor is not None:
                break

        title = normalize_ws(anchor.get_text(" ", strip=True)) if anchor else ""
        href = normalize_ws(anchor.get("href")) if anchor else ""

        summary_node = post.select_one(".post__text")
        summary = normalize_ws(summary_node.get_text(" ", strip=True)) if summary_node else ""

        published_at = _parse_post_date(post)

        # Nothing usable to identify the entry by.
        if not title and not href:
            continue

        if published_at:
            published_year = published_at.year
        else:
            # The full date could not be built; fall back to the year label alone.
            published_year = _int_or_none(normalize_ws(_select_text(post, ".post-meta__year")))

        item = {
            # A link without visible text is labelled with its own href.
            "title": title or href,
            "url": urljoin(source_url, href) if href else None,
            "summary": summary or None,
            "published_at": published_at.isoformat() if published_at else None,
            "published_year": published_year,
            "raw_data": {
                "title": title or href,
                "url": href or None,
                "summary": summary or None,
                "date_text": normalize_ws(_select_text(post, ".post-meta__date")),
            },
        }
        news.append(item)

    return _dedupe_news_links(news)
|
||||
|
||||
|
||||
def _select_text(node: Tag, selector: str) -> str:
    """Return the text of the first element under *node* matching *selector*.

    Text fragments are joined with a single space and stripped. An empty
    string is returned when nothing matches.
    """
    match = node.select_one(selector)
    if not match:
        return ""
    return match.get_text(" ", strip=True)
|
||||
|
||||
|
||||
def _parse_post_date(post: Tag) -> datetime | None:
    """Build a UTC ``datetime`` from the day/month/year labels of a post.

    The three parts are read from ``.post-meta__day``, ``.post-meta__month``
    and ``.post-meta__year``. ``None`` is returned when any part is missing
    or falsy, or when the parts do not form a valid calendar date.
    """
    day_text = normalize_ws(_select_text(post, ".post-meta__day"))
    month_text = normalize_ws(_select_text(post, ".post-meta__month"))
    year_text = normalize_ws(_select_text(post, ".post-meta__year"))

    date_parts = (
        _int_or_none(year_text),
        _month_number(month_text),
        _int_or_none(day_text),
    )
    if not all(date_parts):
        return None

    try:
        parsed = datetime(*date_parts, tzinfo=timezone.utc)
    except ValueError:
        # e.g. day 31 in a 30-day month
        return None
    return parsed
|
||||
|
||||
|
||||
def _month_number(value: str) -> int | None:
|
||||
lowered = value.lower().strip(".")
|
||||
months = {
|
||||
"янв": 1,
|
||||
"январь": 1,
|
||||
"января": 1,
|
||||
"фев": 2,
|
||||
"февр": 2,
|
||||
"февраль": 2,
|
||||
"февраля": 2,
|
||||
"март": 3,
|
||||
"мар": 3,
|
||||
"марта": 3,
|
||||
"апр": 4,
|
||||
"апрель": 4,
|
||||
"апреля": 4,
|
||||
"май": 5,
|
||||
"мая": 5,
|
||||
"июнь": 6,
|
||||
"июня": 6,
|
||||
"июль": 7,
|
||||
"июля": 7,
|
||||
"авг": 8,
|
||||
"август": 8,
|
||||
"августа": 8,
|
||||
"сент": 9,
|
||||
"сен": 9,
|
||||
"сентябрь": 9,
|
||||
"сентября": 9,
|
||||
"окт": 10,
|
||||
"октябрь": 10,
|
||||
"октября": 10,
|
||||
"нояб": 11,
|
||||
"ноябрь": 11,
|
||||
"ноября": 11,
|
||||
"дек": 12,
|
||||
"декабрь": 12,
|
||||
"декабря": 12,
|
||||
}
|
||||
return months.get(lowered)
|
||||
|
||||
|
||||
def _normalize_publication_item(item: dict, current_author_id: str | None = None) -> dict:
|
||||
publication_id = str(item.get("id") or "").strip()
|
||||
title = _html_to_text(item.get("title"))
|
||||
@@ -698,6 +805,17 @@ def _dedupe_publications(items: list[dict]) -> list[dict]:
|
||||
return unique
|
||||
|
||||
|
||||
def _dedupe_news_links(items: list[dict]) -> list[dict]:
|
||||
seen = set()
|
||||
unique = []
|
||||
for item in items:
|
||||
key = item.get("url") or item.get("title")
|
||||
if key and key not in seen:
|
||||
seen.add(key)
|
||||
unique.append(item)
|
||||
return unique
|
||||
|
||||
|
||||
def _html_to_text(value: object) -> str:
    """Strip markup from *value* and return its whitespace-normalised text.

    Falsy input (``None``, ``""``, ``0``) is treated as an empty document.
    """
    markup = str(value or "")
    parsed = BeautifulSoup(markup, "html.parser")
    plain_text = parsed.get_text(" ", strip=True)
    return normalize_ws(plain_text)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user