feat: adds crawl resource cache

This commit is contained in:
Anton
2026-05-14 12:21:44 +03:00
parent 5180b89b81
commit 6724b3f369
20 changed files with 1192 additions and 73 deletions

View File

@@ -1,6 +1,7 @@
import gzip
import hashlib
import json
import re
import time
from datetime import datetime, timezone
@@ -14,6 +15,7 @@ from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
from app.services.dataset_versions import get_or_create_current_version
from app.services.resource_cache import ResourceCache
HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
@@ -29,8 +31,10 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
found_keys: set[str] = set()
parsed_count = 0
skipped_count = 0
try:
with requests.Session() as session:
resource_cache = ResourceCache(db)
urls = collect_profile_links(session, source.source_url, HEADERS, settings.request_timeout)
if settings.crawl_limit:
urls = urls[: settings.crawl_limit]
@@ -48,12 +52,17 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
HEADERS,
settings.request_timeout,
settings.parser_use_playwright,
resource_cache=resource_cache,
)
if not parsed:
continue
_upsert_employee(db, run, parsed)
parsed_count += 1
_, changed = _upsert_employee(db, run, parsed)
if changed:
parsed_count += 1
else:
skipped_count += 1
run.parsed_count = parsed_count
run.skipped_count = skipped_count
db.commit()
except Exception as exc:
run.error_count += 1
@@ -69,7 +78,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
finally:
time.sleep(settings.request_delay_seconds)
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed"
get_or_create_current_version(db, crawl_run_id=run.id)
except Exception as exc:
@@ -90,20 +99,25 @@ def refresh_employee(db: Session, employee: Employee, settings: Settings) -> Cra
try:
with requests.Session() as session:
resource_cache = ResourceCache(db)
parsed = parse_person_profile(
session,
employee.canonical_url,
HEADERS,
settings.request_timeout,
settings.parser_use_playwright,
resource_cache=resource_cache,
)
if not parsed:
raise ValueError("Профиль не удалось распарсить.")
if _parsed_profile_key(parsed) != employee.profile_key:
raise ValueError("Распарсенный профиль не совпадает с обновляемым сотрудником.")
_upsert_employee(db, run, parsed)
run.parsed_count = 1
_, changed = _upsert_employee(db, run, parsed)
if changed:
run.parsed_count = 1
else:
run.skipped_count = 1
run.status = "completed"
get_or_create_current_version(db, crawl_run_id=run.id)
except Exception as exc:
@@ -140,8 +154,9 @@ def _parsed_profile_key(parsed: dict) -> str:
return f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> tuple[Employee, bool]:
html = parsed.pop("_html", None)
parsed.pop("_resource_manifest", None)
checksum = _checksum(parsed)
key = _parsed_profile_key(parsed)
employee = db.scalar(select(Employee).where(Employee.profile_key == key))
@@ -160,12 +175,15 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
else:
is_new = False
parser_version = parsed.get("parser_version")
changed = is_new or employee.current_checksum != checksum or employee.parser_version != parser_version
employee.full_name = parsed.get("full_name")
employee.status = "active"
employee.last_seen_at = now
employee.dismissed_at = None
employee.parser_version = parsed.get("parser_version")
employee.current_data = parsed
employee.parser_version = parser_version
if changed:
employee.current_data = parsed
employee.current_checksum = checksum
db.flush()
@@ -179,28 +197,29 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
message="Сотрудник впервые найден в источнике.",
)
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
if changed:
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
db.add(
ProfileTab(
employee_id=employee.id,
title=tab.get("title") or "",
href=tab.get("href") or "",
data_index=tab.get("data_index"),
)
)
db.add(
ProfileTab(
EmployeeSnapshot(
employee_id=employee.id,
title=tab.get("title") or "",
href=tab.get("href") or "",
data_index=tab.get("data_index"),
crawl_run_id=run.id,
parsed_data=parsed,
html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
checksum=checksum,
parser_version=parser_version,
)
)
db.add(
EmployeeSnapshot(
employee_id=employee.id,
crawl_run_id=run.id,
parsed_data=parsed,
html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
checksum=checksum,
parser_version=parsed.get("parser_version"),
)
)
return employee
return employee, changed
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
@@ -268,5 +287,23 @@ def _record_employee_change(
def _checksum(data: dict) -> str:
    """Return the SHA-256 hex digest of a canonical JSON rendering of *data*.

    The data is first passed through ``_stable_checksum_payload`` and then
    serialised exactly once, with sorted keys and compact separators, so the
    digest does not depend on dict insertion order or whitespace.

    Args:
        data: JSON-serialisable mapping to fingerprint.

    Returns:
        Lowercase hexadecimal SHA-256 digest of the UTF-8 encoded payload.

    Raises:
        TypeError: If *data* contains a value ``json.dumps`` cannot serialise.
    """
    # Only the normalised payload is serialised; dumping the raw data first
    # would be discarded work because its result is never used.
    payload = json.dumps(
        _stable_checksum_payload(data),
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
def _stable_checksum_payload(value):
    """Return a copy of *value* with every nested string normalised for hashing.

    Dicts, lists and tuples are walked recursively; each string found is
    passed through ``_normalize_date_dependent_experience``. Any other value
    (numbers, booleans, ``None``, ...) is returned unchanged.

    Args:
        value: Arbitrary JSON-like structure.

    Returns:
        A structure of the same shape, except that tuples come back as lists,
        which ``json.dumps`` serialises identically.
    """
    if isinstance(value, dict):
        return {key: _stable_checksum_payload(item) for key, item in value.items()}
    # json.dumps renders tuples as arrays, so they must be walked like lists;
    # otherwise strings nested in a tuple would bypass normalisation.
    if isinstance(value, (list, tuple)):
        return [_stable_checksum_payload(item) for item in value]
    if isinstance(value, str):
        return _normalize_date_dependent_experience(value)
    return value
# Compiled once at import time: the normaliser runs for every string of every
# parsed profile, so the pattern should not be re-resolved on each call.
# Matches "стаж" (optionally followed by "работы" and "в НИУ ВШЭ" / "в ВШЭ"),
# an optional colon, then a number of years ("год", "года", "годов", "лет").
_EXPERIENCE_YEARS_RE = re.compile(
    r"(?i)(стаж(?:\s+работы)?(?:\s+в\s+ниу\s+вшэ|\s+в\s+вшэ)?\s*:?\s*)\d+\s*(?:год(?:а|ов)?|лет)"
)


def _normalize_date_dependent_experience(value: str) -> str:
    """Replace the year count in "стаж ... N лет" phrases with a placeholder.

    The leading label (group 1) is kept verbatim; only the number and the
    year word are replaced by ``<experience-years>``. Matching is
    case-insensitive, and text that does not match is returned unchanged.

    Args:
        value: Text to normalise.

    Returns:
        *value* with every matching year count replaced by the placeholder.
    """
    return _EXPERIENCE_YEARS_RE.sub(r"\1<experience-years>", value)