# miem_workers/app/parser/profile_url.py

import re
from urllib.parse import urljoin, urlsplit, urlunsplit

BASE_URL = "https://www.hse.ru"

_ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$")
_STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$")


def normalize_profile_url(href: str | None) -> str | None:
    if not href:
        return None
    candidate = urljoin(BASE_URL + "/", href.strip())
    split = urlsplit(candidate)
    path = split.path.rstrip("/")

    org_match = _ORG_PATTERN.match(path + "/")
    if org_match:
        return urlunsplit(("https", "www.hse.ru", f"/org/persons/{org_match.group(1)}", "", ""))

    staff_match = _STAFF_PATTERN.match(path + "/")
    if staff_match:
        return urlunsplit(("https", "www.hse.ru", f"/staff/{staff_match.group(1)}", "", ""))

    return None


def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Split a profile URL into a ``(kind, identifier)`` pair.

    The URL is first passed through ``normalize_profile_url``. The result is
    ``("org_person", <id>)`` for an ``/org/persons/<id>`` path,
    ``("staff", <slug>)`` for a ``/staff/<slug>`` path, and ``(None, None)``
    when normalization fails or neither pattern matches.
    """
    canonical = normalize_profile_url(profile_url)
    if canonical:
        # Patterns are matched against the path with exactly one trailing slash.
        probe = urlsplit(canonical).path.rstrip("/") + "/"
        # Order matters only for parity with the lookup order used elsewhere:
        # numeric org profiles first, then staff slugs.
        for kind, pattern in (("org_person", _ORG_PATTERN), ("staff", _STAFF_PATTERN)):
            hit = pattern.match(probe)
            if hit:
                return kind, hit.group(1)
    return None, None


def profile_key(profile_url: str) -> str | None:
    """Build a ``"<kind>:<identifier>"`` key for a profile URL.

    Both parts come from ``parse_profile_identity``; ``None`` is returned when
    either part is missing or empty.
    """
    kind, ident = parse_profile_identity(profile_url)
    if kind and ident:
        return f"{kind}:{ident}"
    return None