feat: adds crawl resource cache
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
@@ -149,22 +151,42 @@ def parse_person_profile(
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
use_playwright: bool = False,
|
||||
resource_cache=None,
|
||||
) -> dict | None:
|
||||
normalized_url = normalize_profile_url(source_url)
|
||||
if not normalized_url:
|
||||
return None
|
||||
response = session.get(normalized_url, headers=headers, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
html = response.text
|
||||
profile_type, profile_id = parse_profile_identity(normalized_url)
|
||||
cache_profile_key = f"{profile_type}:{profile_id}"
|
||||
resource_manifest = []
|
||||
html = _fetch_text(
|
||||
session,
|
||||
normalized_url,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=cache_profile_key,
|
||||
resource_key="main-html",
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if use_playwright:
|
||||
html = _render_with_playwright(normalized_url, html)
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
profile_type, profile_id = parse_profile_identity(normalized_url)
|
||||
header = extract_person_header(soup, normalized_url)
|
||||
tabs = extract_person_tabs(soup, normalized_url)
|
||||
sections = extract_sections(soup, normalized_url)
|
||||
sections = enrich_sections_from_hse_widgets(session, soup, normalized_url, headers, timeout, sections)
|
||||
sections = enrich_sections_from_hse_widgets(
|
||||
session,
|
||||
soup,
|
||||
normalized_url,
|
||||
headers,
|
||||
timeout,
|
||||
sections,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=cache_profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
internal_links = [tab["href"] for tab in tabs if tab.get("href")]
|
||||
|
||||
return {
|
||||
@@ -181,6 +203,7 @@ def parse_person_profile(
|
||||
"employee_internal_links": internal_links,
|
||||
"parser_version": BACKEND_VERSION,
|
||||
"_html": html,
|
||||
"_resource_manifest": resource_manifest,
|
||||
}
|
||||
|
||||
|
||||
@@ -191,13 +214,33 @@ def enrich_sections_from_hse_widgets(
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
sections: list[dict],
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
enriched = list(sections)
|
||||
publications = _load_widget_publications(session, soup, headers, timeout)
|
||||
publications = _load_widget_publications(
|
||||
session,
|
||||
soup,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if publications:
|
||||
enriched = _upsert_publications_section(enriched, publications)
|
||||
|
||||
theses = _load_widget_graduation_theses(session, soup, source_url, headers, timeout)
|
||||
theses = _load_widget_graduation_theses(
|
||||
session,
|
||||
soup,
|
||||
source_url,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if theses:
|
||||
enriched = _upsert_graduation_theses_section(enriched, theses)
|
||||
return enriched
|
||||
@@ -226,7 +269,16 @@ def _render_with_playwright(source_url: str, fallback_html: str) -> str:
|
||||
return fallback_html
|
||||
|
||||
|
||||
def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: dict[str, str], timeout: int) -> list[dict]:
|
||||
def _load_widget_publications(
|
||||
session: Session,
|
||||
soup: BeautifulSoup,
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
*,
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
script = soup.select_one('script[data-widget-name="AuthorSearch"][data-author]')
|
||||
if not script:
|
||||
return []
|
||||
@@ -251,14 +303,29 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
|
||||
},
|
||||
}
|
||||
try:
|
||||
response = session.post(
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
if resource_cache and profile_key:
|
||||
text = _fetch_text(
|
||||
session,
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_key=f"publications-page-{page_id}",
|
||||
resource_manifest=resource_manifest,
|
||||
method="POST",
|
||||
json_payload=payload,
|
||||
)
|
||||
data = json.loads(text)
|
||||
else:
|
||||
response = session.post(
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except Exception:
|
||||
return publications
|
||||
|
||||
@@ -309,6 +376,10 @@ def _load_widget_graduation_theses(
|
||||
source_url: str,
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
*,
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
script = soup.select_one('script[src*="/n/stat/vkr/app.js"][data-person-id]')
|
||||
if not script:
|
||||
@@ -320,14 +391,30 @@ def _load_widget_graduation_theses(
|
||||
|
||||
request_headers = {**headers, "x-portal-language": "ru"}
|
||||
try:
|
||||
response = session.get(
|
||||
urljoin(source_url, api_url),
|
||||
params={"supervisorId": person_id},
|
||||
headers=request_headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
url = urljoin(source_url, api_url)
|
||||
params = {"supervisorId": person_id}
|
||||
if resource_cache and profile_key:
|
||||
text = _fetch_text(
|
||||
session,
|
||||
url,
|
||||
request_headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_key="graduation-theses",
|
||||
resource_manifest=resource_manifest,
|
||||
params=params,
|
||||
)
|
||||
data = json.loads(text)
|
||||
else:
|
||||
response = session.get(
|
||||
url,
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
@@ -629,3 +716,62 @@ def _dedupe_dicts(items: list[dict]) -> list[dict]:
|
||||
seen.add(key)
|
||||
unique.append(item)
|
||||
return unique
|
||||
|
||||
|
||||
def _fetch_text(
    session: Session,
    url: str,
    headers: dict[str, str],
    timeout: int,
    *,
    resource_cache=None,
    profile_key: str | None = None,
    resource_key: str,
    resource_manifest: list[dict] | None,
    method: str = "GET",
    json_payload: object | None = None,
    params: dict | None = None,
) -> str:
    """Fetch the body of ``url`` as text, optionally through a resource cache.

    When both ``resource_cache`` and a non-empty ``profile_key`` are given,
    the request is delegated to ``resource_cache.fetch_text(...)`` and the
    ``text`` attribute of its result is returned. Otherwise the request is
    issued directly on ``session`` (``POST`` when ``method`` is "post" in any
    letter case, ``GET`` for every other value) and ``raise_for_status()`` is
    called on the response before its text is returned.

    Args:
        session: HTTP session used for the request (or handed to the cache).
        url: Target URL.
        headers: Request headers.
        timeout: Request timeout forwarded to the session or cache.
        resource_cache: Optional cache object exposing ``fetch_text``.
            NOTE(review): its result is expected to expose ``text``,
            ``body_hash``, ``from_cache`` and ``status_code``; confirm
            against the cache implementation.
        profile_key: Key identifying the profile the resource belongs to;
            the cache is only used when this is non-empty.
        resource_key: Name of the resource, recorded in the manifest and
            forwarded to the cache.
        resource_manifest: If not ``None``, one entry describing this fetch
            is appended to it in place.
        method: HTTP method; only "POST" is special-cased.
        json_payload: JSON body sent with direct ``POST`` requests and
            forwarded to the cache.
        params: Query parameters.

    Returns:
        The response body as text.

    Raises:
        Whatever ``response.raise_for_status()`` raises on the direct path,
        or whatever ``resource_cache.fetch_text`` raises on the cached path.
    """
    # Fields shared by both manifest flavours; key order matches the
    # original entries (resource_key, method, url, then provenance fields).
    entry_base = {"resource_key": resource_key, "method": method, "url": url}

    # Compare against None instead of relying on truthiness: a cache object
    # that defines __len__/__bool__ would be falsy while empty and would
    # otherwise be bypassed (and therefore never populated).
    if resource_cache is not None and profile_key:
        cached = resource_cache.fetch_text(
            session,
            profile_key=profile_key,
            resource_key=resource_key,
            method=method,
            url=url,
            headers=headers,
            timeout=timeout,
            json_payload=json_payload,
            params=params,
        )
        if resource_manifest is not None:
            resource_manifest.append(
                {
                    **entry_base,
                    "body_hash": cached.body_hash,
                    "from_cache": cached.from_cache,
                    "status_code": cached.status_code,
                }
            )
        return cached.text

    if method.upper() == "POST":
        response = session.post(
            url,
            json=json_payload,
            headers=headers,
            timeout=timeout,
            params=params,
        )
    else:
        response = session.get(url, headers=headers, timeout=timeout, params=params)
    response.raise_for_status()

    text = response.text
    if resource_manifest is not None:
        resource_manifest.append(
            {
                **entry_base,
                # Hash the UTF-8 encoding of the decoded body.
                "body_hash": hashlib.sha256(text.encode("utf-8")).hexdigest(),
                "from_cache": False,
                "status_code": response.status_code,
            }
        )
    return text
|
||||
|
||||
Reference in New Issue
Block a user