Files
miem_workers/app/services/crawler.py

224 lines
7.2 KiB
Python

import gzip
import hashlib
import json
import time
from datetime import datetime, timezone
import requests
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import Settings
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
from app.parser.collector import collect_profile_links
from app.parser.profile import parse_person_profile
from app.parser.profile_url import profile_key
HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
}
def run_crawl(db: Session, settings: Settings) -> CrawlRun:
source = _ensure_source(db, settings.source_url)
run = CrawlRun(source_url=source.source_url, status="running")
db.add(run)
db.commit()
db.refresh(run)
found_keys: set[str] = set()
parsed_count = 0
try:
with requests.Session() as session:
urls = collect_profile_links(session, source.source_url, HEADERS, settings.request_timeout)
if settings.crawl_limit:
urls = urls[: settings.crawl_limit]
run.found_count = len(urls)
db.commit()
for url in urls:
key = profile_key(url)
if key:
found_keys.add(key)
try:
parsed = parse_person_profile(
session,
url,
HEADERS,
settings.request_timeout,
settings.parser_use_playwright,
)
if not parsed:
continue
_upsert_employee(db, run, parsed)
parsed_count += 1
run.parsed_count = parsed_count
db.commit()
except Exception as exc:
run.error_count += 1
db.add(
CrawlError(
crawl_run_id=run.id,
profile_url=url,
error_type=type(exc).__name__,
message=str(exc),
)
)
db.commit()
finally:
time.sleep(settings.request_delay_seconds)
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
run.status = "completed"
except Exception as exc:
run.status = "failed"
run.message = str(exc)
finally:
run.finished_at = datetime.now(timezone.utc)
db.commit()
db.refresh(run)
return run
def _ensure_source(db: Session, source_url: str) -> ParserSource:
source = db.scalar(select(ParserSource).where(ParserSource.source_url == source_url))
if source:
return source
source = ParserSource(source_url=source_url, enabled=True)
db.add(source)
db.commit()
db.refresh(source)
return source
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
html = parsed.pop("_html", None)
checksum = _checksum(parsed)
key = f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
employee = db.scalar(select(Employee).where(Employee.profile_key == key))
now = datetime.now(timezone.utc)
if not employee:
employee = Employee(
profile_key=key,
profile_type=parsed.get("profile_type"),
profile_id=parsed.get("profile_id"),
canonical_url=parsed["source_url"],
first_seen_at=now,
)
db.add(employee)
run.new_count += 1
is_new = True
else:
is_new = False
employee.full_name = parsed.get("full_name")
employee.status = "active"
employee.last_seen_at = now
employee.dismissed_at = None
employee.parser_version = parsed.get("parser_version")
employee.current_data = parsed
employee.current_checksum = checksum
db.flush()
if is_new:
_record_employee_change(
db,
run,
employee,
"new",
profile_available=True,
message="Сотрудник впервые найден в источнике.",
)
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
for tab in parsed.get("tabs") or []:
db.add(
ProfileTab(
employee_id=employee.id,
title=tab.get("title") or "",
href=tab.get("href") or "",
data_index=tab.get("data_index"),
)
)
db.add(
EmployeeSnapshot(
employee_id=employee.id,
crawl_run_id=run.id,
parsed_data=parsed,
html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
checksum=checksum,
parser_version=parsed.get("parser_version"),
)
)
return employee
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
dismissed = 0
active = db.scalars(select(Employee).where(Employee.status == "active")).all()
now = datetime.now(timezone.utc)
for employee in active:
if employee.profile_key in found_keys:
continue
profile_available = _profile_is_available(session, employee.canonical_url, timeout)
if profile_available:
_record_employee_change(
db,
run,
employee,
"missing_from_source",
profile_available=True,
message="Профиль доступен, но ссылка отсутствует в исходном списке.",
)
continue
employee.status = "dismissed"
employee.dismissed_at = now
_record_employee_change(
db,
run,
employee,
"dismissed",
profile_available=False,
message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.",
)
dismissed += 1
db.commit()
return dismissed
def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool:
try:
response = session.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
return response.status_code < 400
except requests.RequestException:
return False
def _record_employee_change(
db: Session,
run: CrawlRun,
employee: Employee,
change_type: str,
*,
profile_available: bool | None,
message: str,
) -> None:
db.add(
CrawlRunEmployeeChange(
crawl_run_id=run.id,
employee_id=employee.id,
profile_key=employee.profile_key,
profile_url=employee.canonical_url,
full_name=employee.full_name,
change_type=change_type,
profile_available=profile_available,
message=message,
)
)
def _checksum(data: dict) -> str:
payload = json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()