47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
import re
|
|
from urllib.parse import urljoin, urlsplit, urlunsplit
|
|
|
|
# Origin of the HSE site; hrefs are resolved against this base before matching.
BASE_URL: str = "https://www.hse.ru"

# Path of a numeric-ID profile page ("/org/persons/<digits>/"); group 1 is the ID.
_ORG_PATTERN: re.Pattern[str] = re.compile(r"^/org/persons/(\d+)/?$")

# Path of a slug-style staff page ("/staff/<slug>/"); group 1 is the slug.
_STAFF_PATTERN: re.Pattern[str] = re.compile(r"^/staff/([^/?#]+)/?$")
|
|
|
|
|
|
def normalize_profile_url(href: str | None) -> str | None:
    """Return the canonical HSE profile URL for *href*, or ``None``.

    *href* may be absolute or relative; relative values are resolved against
    ``BASE_URL``. Surrounding whitespace, the query string, the fragment and
    trailing slashes are discarded. The result always has the form
    ``https://www.hse.ru/org/persons/<digits>`` or
    ``https://www.hse.ru/staff/<slug>``.

    ``None`` is returned when *href* is empty, cannot be parsed, uses a
    scheme other than http/https, points at a host outside ``hse.ru`` (or
    one of its subdomains), or does not have a profile-page path.
    """
    if not href:
        return None

    try:
        split = urlsplit(urljoin(BASE_URL + "/", href.strip()))
    except ValueError:
        # urllib rejects malformed authorities (e.g. an unclosed "[" in the
        # host); treat such hrefs as "not a profile link" instead of crashing.
        return None

    # Only web links can be profile pages (rules out ftp:, javascript:, ...).
    if split.scheme not in ("http", "https"):
        return None

    # A foreign site whose path merely looks like "/staff/x" must not be
    # rewritten into an HSE profile URL. ``hostname`` is already lower-cased
    # and stripped of port and userinfo.
    host = split.hostname or ""
    if host != "hse.ru" and not host.endswith(".hse.ru"):
        return None

    # Collapse any run of trailing slashes to exactly one before matching.
    path = split.path.rstrip("/") + "/"
    for pattern, prefix in (
        (_ORG_PATTERN, "/org/persons/"),
        (_STAFF_PATTERN, "/staff/"),
    ):
        match = pattern.match(path)
        if match:
            return urlunsplit(
                ("https", "www.hse.ru", f"{prefix}{match.group(1)}", "", "")
            )

    return None
|
|
|
|
|
|
def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Split a profile URL into a ``(kind, identifier)`` pair.

    The URL is first passed through ``normalize_profile_url``. The kind is
    ``"org_person"`` for ``/org/persons/<digits>`` pages and ``"staff"`` for
    ``/staff/<slug>`` pages; the identifier is the captured digits or slug.
    ``(None, None)`` is returned when the URL does not normalize to either
    form.
    """
    canonical = normalize_profile_url(profile_url)
    if canonical:
        # The patterns are matched against the path with one trailing slash.
        probe = urlsplit(canonical).path.rstrip("/") + "/"
        # Order matters only for parity with the normalizer: org first.
        for kind, pattern in (
            ("org_person", _ORG_PATTERN),
            ("staff", _STAFF_PATTERN),
        ):
            hit = pattern.match(probe)
            if hit:
                return kind, hit.group(1)

    return None, None
|
|
|
|
|
|
def profile_key(profile_url: str) -> str | None:
    """Build a ``"<kind>:<identifier>"`` key for a profile URL.

    The kind and identifier come from ``parse_profile_identity``. ``None`` is
    returned when either part is missing or empty.
    """
    kind, identifier = parse_profile_identity(profile_url)
    if kind and identifier:
        return f"{kind}:{identifier}"
    return None
|