import re from urllib.parse import urljoin, urlsplit, urlunsplit BASE_URL = "https://www.hse.ru" _ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$") _STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$") def normalize_profile_url(href: str | None) -> str | None: if not href: return None candidate = urljoin(BASE_URL + "/", href.strip()) split = urlsplit(candidate) path = split.path.rstrip("/") org_match = _ORG_PATTERN.match(path + "/") if org_match: return urlunsplit(("https", "www.hse.ru", f"/org/persons/{org_match.group(1)}", "", "")) staff_match = _STAFF_PATTERN.match(path + "/") if staff_match: return urlunsplit(("https", "www.hse.ru", f"/staff/{staff_match.group(1)}", "", "")) return None def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]: normalized = normalize_profile_url(profile_url) if not normalized: return None, None path = urlsplit(normalized).path.rstrip("/") org_match = _ORG_PATTERN.match(path + "/") if org_match: return "org_person", org_match.group(1) staff_match = _STAFF_PATTERN.match(path + "/") if staff_match: return "staff", staff_match.group(1) return None, None def profile_key(profile_url: str) -> str | None: profile_type, profile_id = parse_profile_identity(profile_url) if not profile_type or not profile_id: return None return f"{profile_type}:{profile_id}"