feature: add MIEM employees parser service with admin UI and MCP
This commit is contained in:
46
app/parser/profile_url.py
Normal file
46
app/parser/profile_url.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import re
|
||||
from urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
|
||||
# Site root against which relative profile hrefs are resolved.
BASE_URL = "https://www.hse.ru"

# Path of a numeric profile: /org/persons/<digits>, optional trailing slash.
# The digits are captured as group 1.
_ORG_PATTERN = re.compile(r"^/org/persons/(\d+)/?$")

# Path of a slug profile: /staff/<slug>, optional trailing slash. The slug is a
# single path segment (no "/", "?" or "#") and is captured as group 1.
_STAFF_PATTERN = re.compile(r"^/staff/([^/?#]+)/?$")
|
||||
|
||||
|
||||
def normalize_profile_url(href: str | None) -> str | None:
    """Return the canonical profile URL for *href*, or ``None``.

    Relative hrefs are resolved against ``BASE_URL``. A link is recognised
    when its path is ``/org/persons/<digits>`` or ``/staff/<slug>`` (any
    number of trailing slashes is tolerated). The result always uses the
    ``https`` scheme and the ``www.hse.ru`` host, and drops the query string
    and fragment.

    Args:
        href: Raw link value, possibly relative, padded with whitespace,
            empty, or ``None``.

    Returns:
        The canonical URL without a trailing slash, or ``None`` when *href*
        is empty, malformed, points at a host outside ``hse.ru``, or does not
        match a known profile path.
    """
    if not href:
        return None

    try:
        parts = urlsplit(urljoin(BASE_URL + "/", href.strip()))
    except ValueError:
        # urllib rejects malformed authorities (e.g. "http://[bad"); such a
        # link cannot be a profile, so report it like any other non-match.
        return None

    # Only links that actually live on hse.ru (or a subdomain) may be rewritten
    # to the canonical host; otherwise an external ".../staff/x" link would be
    # mistaken for a profile. Relative hrefs inherit the host of BASE_URL.
    host = parts.hostname
    if host is None or not (host == "hse.ru" or host.endswith(".hse.ru")):
        return None

    # Collapse trailing slashes to exactly one; both patterns accept it.
    probe = parts.path.rstrip("/") + "/"
    for prefix, pattern in (
        ("/org/persons/", _ORG_PATTERN),
        ("/staff/", _STAFF_PATTERN),
    ):
        found = pattern.match(probe)
        if found:
            return urlunsplit(
                ("https", "www.hse.ru", prefix + found.group(1), "", "")
            )

    return None
|
||||
|
||||
|
||||
def parse_profile_identity(profile_url: str) -> tuple[str | None, str | None]:
    """Split a profile URL into a ``(profile_type, profile_id)`` pair.

    The URL is first passed through ``normalize_profile_url``. The type is
    ``"org_person"`` for ``/org/persons/<digits>`` paths and ``"staff"`` for
    ``/staff/<slug>`` paths; the id is the captured digits or slug.

    Returns:
        ``(profile_type, profile_id)``, or ``(None, None)`` when the URL
        cannot be normalized or matches neither profile pattern.
    """
    canonical = normalize_profile_url(profile_url)
    if not canonical:
        return None, None

    # Match against the path with exactly one trailing slash, which both
    # patterns accept.
    probe = urlsplit(canonical).path.rstrip("/") + "/"
    for kind, pattern in (("org_person", _ORG_PATTERN), ("staff", _STAFF_PATTERN)):
        found = pattern.match(probe)
        if found:
            return kind, found.group(1)

    return None, None
|
||||
|
||||
|
||||
def profile_key(profile_url: str) -> str | None:
    """Build a ``"<profile_type>:<profile_id>"`` key for a profile URL.

    Returns:
        The combined key, or ``None`` when ``parse_profile_identity`` yields
        no type or no id for *profile_url*.
    """
    kind, ident = parse_profile_identity(profile_url)
    if kind and ident:
        return f"{kind}:{ident}"
    return None
|
||||
Reference in New Issue
Block a user