Files
miem_workers/app/parser/profile_url.py

47 lines
1.4 KiB
Python

import re
from urllib.parse import urljoin, urlsplit, urlunsplit
BASE_URL = "https://www.hse.ru"
_ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$")
_STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$")
def normalize_profile_url(href: str | None) -> str | None:
if not href:
return None
candidate = urljoin(BASE_URL + "/", href.strip())
split = urlsplit(candidate)
path = split.path.rstrip("/")
org_match = _ORG_PATTERN.match(path + "/")
if org_match:
return urlunsplit(("https", "www.hse.ru", f"/org/persons/{org_match.group(1)}", "", ""))
staff_match = _STAFF_PATTERN.match(path + "/")
if staff_match:
return urlunsplit(("https", "www.hse.ru", f"/staff/{staff_match.group(1)}", "", ""))
return None
def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Classify a profile URL and extract its identifier.

    The URL is first passed through ``normalize_profile_url``; the path of
    the result is then tried against the organisation-person pattern and the
    staff pattern, in that order.

    Returns ``("org_person", <digits>)`` or ``("staff", <slug>)`` on a match,
    and ``(None, None)`` when normalisation yields nothing or neither
    pattern matches.
    """
    canonical = normalize_profile_url(profile_url)
    if canonical:
        # Collapse trailing slashes to exactly one before matching.
        probe = urlsplit(canonical).path.rstrip("/") + "/"
        candidates = (
            ("org_person", _ORG_PATTERN),
            ("staff", _STAFF_PATTERN),
        )
        for kind, pattern in candidates:
            found = pattern.match(probe)
            if found:
                return kind, found.group(1)
    return None, None
def profile_key(profile_url: str) -> str | None:
    """Return a ``"<type>:<id>"`` key for *profile_url*.

    The type and id come from ``parse_profile_identity``; ``None`` is
    returned when either of them is missing or empty.
    """
    kind, ident = parse_profile_identity(profile_url)
    if kind and ident:
        return f"{kind}:{ident}"
    return None