feature: add MIEM employees parser service with admin UI and MCP
This commit is contained in:
380
app/parser/profile.py
Normal file
380
app/parser/profile.py
Normal file
@@ -0,0 +1,380 @@
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from requests import Session
|
||||
|
||||
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
|
||||
from app.version import BACKEND_VERSION
|
||||
|
||||
_YEAR_PATTERN = re.compile(r"Начал[аи]?\s+работать.*?ВШЭ.*?(\d{4})", re.IGNORECASE)
|
||||
_EMAIL_PATTERN = re.compile(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})")
|
||||
_PHONE_PATTERN = re.compile(r"(?:Телефон|Phone)\s*:\s*([+()\d\-\s]{8,})", re.IGNORECASE)
|
||||
|
||||
|
||||
def normalize_ws(value: str | None) -> str:
|
||||
return re.sub(r"\s+", " ", value or "").strip()
|
||||
|
||||
|
||||
def extract_person_tabs(soup: BeautifulSoup, source_url: str) -> list[dict[str, str | None]]:
    """Return the profile's navigation tabs as ``data_index`` / ``title`` / ``href`` dicts.

    The menu selectors are tried from most to least specific; the first menu that
    yields at least one titled link wins. Hrefs are resolved against *source_url*.
    An empty list is returned when no menu produces any tab.
    """
    menu_selectors = (
        "div.person-menu.is-desktop.small.person-menu-addition",
        ".person-menu",
    )
    for css in menu_selectors:
        container = soup.select_one(css)
        if not container:
            continue

        collected = []
        for link in container.select("a[href]"):
            caption = normalize_ws(link.get_text(" ", strip=True))
            target = link.get("href", "").strip()
            # Links without visible text or without a target are not tabs.
            if not (caption and target):
                continue
            collected.append(
                {
                    "data_index": link.get("data-index"),
                    "title": caption,
                    "href": urljoin(source_url, target),
                }
            )

        if collected:
            return _dedupe_tabs(collected)
    return []
|
||||
|
||||
|
||||
def extract_person_header(soup: BeautifulSoup, source_url: str) -> dict:
    """Collect header-level facts (name, positions, contacts, IDs) from a profile page.

    Args:
        soup: Parsed profile page.
        source_url: URL the page was fetched from. It is echoed back in the result
            and used as the base for resolving external-profile links.

    Returns:
        A dict with the keys ``source_url``, ``full_name`` (``None`` when the page
        has no ``<h1>``), ``positions``, ``hse_start_year`` (``int`` or ``None``),
        ``contacts`` (``phones``, ``emails``, ``address``, ``items``) and
        ``external_ids`` (``system`` / ``value`` / ``url`` dicts, de-duplicated).
    """
    name_node = soup.select_one("h1.person-caption") or soup.find("h1")
    # Contacts and the start year are searched in the flattened text of the whole page.
    text = normalize_ws(soup.get_text(" ", strip=True))
    year_match = _YEAR_PATTERN.search(text)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    emails = list(dict.fromkeys(_EMAIL_PATTERN.findall(text)))
    phones = list(
        dict.fromkeys(
            phone
            for phone in map(normalize_ws, _PHONE_PATTERN.findall(text))
            if phone
        )
    )
    contacts = {"phones": phones, "emails": emails, "address": None, "items": []}

    # The address runs from the "Адрес" label up to the next known label
    # (or the end of the text), capped at 220 characters.
    address_match = re.search(
        r"(Адрес[:\s].{0,220}?)(?:\s+Время|\s+Расписание|\s+SPIN|\s+ORCID|$)",
        text,
        flags=re.IGNORECASE,
    )
    if address_match:
        contacts["address"] = normalize_ws(address_match.group(1)).rstrip(",")

    positions = []
    for li in soup.select("ul.g-ul.g-list.small.employment-add li, ul.employment-add li"):
        value = normalize_ws(li.get_text(" ", strip=True))
        if value:
            positions.append(value)

    # (system label, substring that identifies the system in a link target)
    id_domains = (
        ("ORCID", "orcid.org"),
        ("Scopus AuthorID", "scopus.com"),
        ("ResearcherID", "webofscience.com"),
        ("Google Scholar", "scholar.google."),
        ("SPIN РИНЦ", "elibrary.ru"),
    )
    external_ids = []
    for anchor in soup.select("a[href]"):
        href = anchor.get("href", "").strip()
        # First matching system wins, mirroring the declaration order above.
        system = next((name for name, marker in id_domains if marker in href), None)
        if system is None:
            continue
        label = normalize_ws(anchor.get_text(" ", strip=True))
        external_ids.append(
            {
                "system": system,
                "value": label or system,
                # Resolve like every other link in this module so that
                # protocol-relative ("//orcid.org/...") hrefs become usable URLs.
                "url": urljoin(source_url, href),
            }
        )

    return {
        "source_url": source_url,
        "full_name": normalize_ws(name_node.get_text(" ", strip=True)) if name_node else None,
        "positions": positions,
        "hse_start_year": int(year_match.group(1)) if year_match else None,
        "contacts": contacts,
        "external_ids": _dedupe_dicts(external_ids),
    }
|
||||
|
||||
|
||||
def extract_sections(soup: BeautifulSoup, source_url: str) -> list[dict]:
    """Split the page into one dict per ``<h2>`` heading.

    Each section carries ``title``, ``slug``, ``type``, ``raw_text``, ``paragraphs``,
    ``items`` and ``links``. Type-specific keys are added on top. Headings with an
    empty title or mentioning "расписание занятий" are skipped.
    """
    result = []
    for heading in soup.select("h2"):
        title = normalize_ws(heading.get_text(" ", strip=True))
        if not title or "расписание занятий" in title.lower():
            continue

        nodes = _collect_between_h2(heading)
        kind = _infer_section_type(title, nodes)
        section = {
            "title": title,
            "slug": _slugify(title),
            "type": kind,
            "raw_text": _nodes_raw_text(nodes),
            "paragraphs": _nodes_paragraphs(nodes),
            "items": _nodes_list_items(nodes),
            "links": [
                link
                for node in nodes
                if isinstance(node, Tag)
                for link in _extract_links(node, source_url)
            ],
        }

        if kind == "publications":
            count, publications = _parse_publications(title, nodes, source_url)
            section["publications_count"] = count
            section["publications"] = publications
            # The generic items are replaced by the publication texts.
            section["items"] = [entry["text"] for entry in publications if entry.get("text")]
        elif kind == "courses_by_year":
            academic_year, courses = _parse_courses(title, nodes, source_url)
            section["academic_year"] = academic_year
            section["courses"] = courses
            # Course sections expose only the structured "courses" list.
            section.pop("items", None)
            section.pop("links", None)
        elif kind == "table":
            section["table"] = _parse_table(nodes, source_url)
        elif "выпускные квалификационные работы студентов ниу вшэ" in title.lower():
            section["items"] = _parse_vkr_items(nodes)

        year_entries = _parse_year_entries(nodes, source_url)
        if year_entries:
            section["year_entries"] = year_entries
            # Only untyped sections are re-labelled; specific types keep their label.
            if kind in {"generic", "paragraphs"}:
                section["type"] = "year_blocks"

        result.append(section)
    return result
|
||||
|
||||
|
||||
def parse_person_profile(
    session: Session,
    source_url: str,
    headers: dict[str, str],
    timeout: int,
    use_playwright: bool = False,
) -> dict | None:
    """Fetch a profile page and return its parsed representation.

    Returns ``None`` when *source_url* does not normalize to a profile URL.
    HTTP errors propagate from ``response.raise_for_status()``. When
    *use_playwright* is true, the fetched HTML is passed to
    ``_render_with_playwright`` together with the URL and its return value is
    what gets parsed. The parsed markup itself is returned under the ``_html`` key.
    """
    profile_url = normalize_profile_url(source_url)
    if not profile_url:
        return None

    reply = session.get(profile_url, headers=headers, timeout=timeout)
    reply.raise_for_status()
    markup = reply.text
    if use_playwright:
        markup = _render_with_playwright(profile_url, markup)

    soup = BeautifulSoup(markup, "html.parser")
    profile_type, profile_id = parse_profile_identity(profile_url)
    header = extract_person_header(soup, profile_url)
    tabs = extract_person_tabs(soup, profile_url)
    sections = extract_sections(soup, profile_url)

    profile = {
        "source_url": profile_url,
        "profile_type": profile_type,
        "profile_id": profile_id,
        "full_name": header.get("full_name"),
        "positions": header.get("positions") or [],
        "hse_start_year": header.get("hse_start_year"),
        "contacts": header.get("contacts") or {},
        "external_ids": header.get("external_ids") or [],
        "tabs": tabs,
        "sections": sections,
        "employee_internal_links": [tab["href"] for tab in tabs if tab.get("href")],
        "parser_version": BACKEND_VERSION,
        "_html": markup,
    }
    return profile
|
||||
|
||||
|
||||
def _render_with_playwright(source_url: str, fallback_html: str) -> str:
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
except Exception:
|
||||
return fallback_html
|
||||
try:
|
||||
with sync_playwright() as playwright:
|
||||
browser = playwright.chromium.launch(headless=True)
|
||||
page = browser.new_page()
|
||||
page.goto(source_url, wait_until="domcontentloaded", timeout=45000)
|
||||
for index in range(page.locator(".person-menu a").count()):
|
||||
try:
|
||||
page.locator(".person-menu a").nth(index).click(timeout=2500, force=True)
|
||||
page.wait_for_timeout(450)
|
||||
except Exception:
|
||||
continue
|
||||
html = page.content()
|
||||
browser.close()
|
||||
return html
|
||||
except Exception:
|
||||
return fallback_html
|
||||
|
||||
|
||||
def _collect_between_h2(start_h2: Tag) -> list[Tag | NavigableString | str]:
    """Gather the siblings that follow *start_h2* up to (not including) the next ``<h2>``.

    Whitespace-only text nodes are dropped; everything else is kept in document order.
    """
    collected = []
    for node in start_h2.next_siblings:
        if isinstance(node, Tag):
            if node.name == "h2":
                # The next heading starts another section.
                break
        elif isinstance(node, NavigableString) and not normalize_ws(str(node)):
            continue
        collected.append(node)
    return collected
|
||||
|
||||
|
||||
def _extract_links(node: Tag, source_url: str) -> list[dict[str, str]]:
    """Return ``{"text", "url"}`` dicts for the anchors inside *node*.

    Anchors without text or href are skipped, and so are timetable links
    (href containing "timetable" or text containing "расписание").
    URLs are resolved against *source_url*.
    """
    found = []
    for anchor in node.select("a[href]"):
        label = normalize_ws(anchor.get_text(" ", strip=True))
        target = anchor.get("href", "").strip()
        if not label or not target:
            continue
        if "timetable" in target.lower() or "расписание" in label.lower():
            continue
        found.append({"text": label, "url": urljoin(source_url, target)})
    return found
|
||||
|
||||
|
||||
def _nodes_raw_text(nodes: list) -> str:
|
||||
chunks = []
|
||||
for node in nodes:
|
||||
text = normalize_ws(node.get_text(" ", strip=True) if isinstance(node, Tag) else str(node))
|
||||
if text:
|
||||
chunks.append(text)
|
||||
return "\n".join(chunks)
|
||||
|
||||
|
||||
def _nodes_paragraphs(nodes: list) -> list[str]:
|
||||
paragraphs = []
|
||||
for node in nodes:
|
||||
if isinstance(node, Tag):
|
||||
paragraphs.extend(normalize_ws(p.get_text(" ", strip=True)) for p in node.select("p"))
|
||||
return [p for p in paragraphs if p]
|
||||
|
||||
|
||||
def _nodes_list_items(nodes: list) -> list[str]:
|
||||
items = []
|
||||
for node in nodes:
|
||||
if isinstance(node, Tag):
|
||||
items.extend(normalize_ws(li.get_text(" ", strip=True)) for li in node.select("li"))
|
||||
return [item for item in items if item and "расписание" not in item.lower()]
|
||||
|
||||
|
||||
def _infer_section_type(title: str, nodes: list) -> str:
    """Classify a section by its content and title.

    Precedence: a table anywhere in the nodes, then title keywords
    (publications, courses), then list items, then paragraphs, else ``"generic"``.
    """
    lowered = title.lower()
    if _has_table(nodes):
        return "table"

    keyword_types = (
        ("публикац", "publications"),
        ("учебные курсы", "courses_by_year"),
    )
    for needle, kind in keyword_types:
        if needle in lowered:
            return kind

    if _nodes_list_items(nodes):
        return "list"
    return "paragraphs" if _nodes_paragraphs(nodes) else "generic"
|
||||
|
||||
|
||||
def _has_table(nodes: list) -> bool:
|
||||
return any(isinstance(node, Tag) and (node.name == "table" or node.find("table")) for node in nodes)
|
||||
|
||||
|
||||
def _parse_table(nodes: list, source_url: str) -> dict:
|
||||
for node in nodes:
|
||||
if not isinstance(node, Tag):
|
||||
continue
|
||||
table = node if node.name == "table" else node.find("table")
|
||||
if not table:
|
||||
continue
|
||||
headers = [normalize_ws(th.get_text(" ", strip=True)) for th in table.select("th")]
|
||||
rows = []
|
||||
for tr in table.select("tr"):
|
||||
cells = [normalize_ws(td.get_text(" ", strip=True)) for td in tr.select("td")]
|
||||
if cells:
|
||||
link = tr.find("a", href=True)
|
||||
rows.append({"cells": cells, "link_url": urljoin(source_url, link["href"]) if link else None})
|
||||
return {"headers": headers, "rows": rows}
|
||||
return {"headers": [], "rows": []}
|
||||
|
||||
|
||||
def _parse_publications(title: str, nodes: list, source_url: str) -> tuple[int | None, list[dict]]:
    """Return ``(count, publications)`` for a publications section.

    ``count`` is the trailing number of *title*, if any. Publications come from the
    ``<li>`` items of the first tag node that has any. When no node has list items,
    every non-empty line of the raw text becomes a publication without a URL.
    """
    publications = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            text = normalize_ws(li.get_text(" ", strip=True))
            anchor = li.find("a", href=True)
            if not text:
                continue
            if anchor:
                entry = {
                    "title": normalize_ws(anchor.get_text(" ", strip=True)),
                    "url": urljoin(source_url, anchor["href"]),
                    "text": text,
                }
            else:
                entry = {"title": text, "url": None, "text": text}
            publications.append(entry)
        # Only the first node that produced items is used.
        if publications:
            break

    if not publications:
        publications = [
            {"title": line, "url": None, "text": line}
            for line in _nodes_raw_text(nodes).split("\n")
            if line
        ]

    count_match = re.search(r"(\d+)\s*$", title)
    count = int(count_match.group(1)) if count_match else None
    return count, publications
|
||||
|
||||
|
||||
def _parse_courses(title: str, nodes: list, source_url: str) -> tuple[str | None, list[dict]]:
    """Return ``(academic_year, courses)`` for a courses section.

    ``academic_year`` is the ``YYYY/YYYY`` fragment of *title*, if present. Each course
    is ``{"title", "url"}`` taken from an ``<li>``: the first link's text and resolved
    href when there is one, otherwise the item's own text with a ``None`` URL.
    Duplicate courses are removed.
    """
    courses = []
    for node in nodes:
        if not isinstance(node, Tag):
            continue
        for li in node.select("li"):
            anchor = li.find("a", href=True)
            source = anchor if anchor else li
            course_title = normalize_ws(source.get_text(" ", strip=True))
            if not course_title:
                continue
            url = urljoin(source_url, anchor["href"]) if anchor else None
            courses.append({"title": course_title, "url": url})

    year_match = re.search(r"(\d{4}/\d{4})", title)
    academic_year = year_match.group(1) if year_match else None
    return academic_year, _dedupe_dicts(courses)
|
||||
|
||||
|
||||
def _parse_year_entries(nodes: list, source_url: str) -> list[dict]:
|
||||
entries = []
|
||||
for node in nodes:
|
||||
if not isinstance(node, Tag):
|
||||
continue
|
||||
for year_node in node.select(".person-list-hangover"):
|
||||
year_match = re.search(r"(19\d{2}|20\d{2})", year_node.get_text(" ", strip=True))
|
||||
parent = year_node.parent
|
||||
if parent:
|
||||
entries.append(
|
||||
{
|
||||
"year": int(year_match.group(1)) if year_match else None,
|
||||
"text": normalize_ws(parent.get_text(" ", strip=True)),
|
||||
"links": _extract_links(parent, source_url),
|
||||
}
|
||||
)
|
||||
return entries
|
||||
|
||||
|
||||
def _parse_vkr_items(nodes: list) -> list[str]:
|
||||
items = []
|
||||
for node in nodes:
|
||||
if isinstance(node, Tag):
|
||||
items.extend(normalize_ws(li.get_text(" ", strip=True)) for li in node.select("li"))
|
||||
return [item for item in dict.fromkeys(items) if item]
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
|
||||
cleaned = re.sub(r"[^\w\s-]", "", value.lower(), flags=re.UNICODE)
|
||||
return re.sub(r"[-\s]+", "_", cleaned).strip("_") or "section"
|
||||
|
||||
|
||||
def _dedupe_tabs(items: list[dict]) -> list[dict]:
|
||||
seen = set()
|
||||
unique = []
|
||||
for item in items:
|
||||
key = (item.get("title"), item.get("href"))
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique.append(item)
|
||||
return unique
|
||||
|
||||
|
||||
def _dedupe_dicts(items: list[dict]) -> list[dict]:
|
||||
seen = set()
|
||||
unique = []
|
||||
for item in items:
|
||||
key = tuple(sorted(item.items()))
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique.append(item)
|
||||
return unique
|
||||
Reference in New Issue
Block a user