feat: track crawl run employee changes and verify dismissals
This commit is contained in:
@@ -9,7 +9,7 @@ from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import Settings
|
||||
from app.models import CrawlError, CrawlRun, Employee, EmployeeSnapshot, ParserSource, ProfileTab
|
||||
from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParserSource, ProfileTab
|
||||
from app.parser.collector import collect_profile_links
|
||||
from app.parser.profile import parse_person_profile
|
||||
from app.parser.profile_url import profile_key
|
||||
@@ -68,7 +68,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
|
||||
finally:
|
||||
time.sleep(settings.request_delay_seconds)
|
||||
|
||||
run.dismissed_count = _mark_dismissed(db, found_keys)
|
||||
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
|
||||
run.status = "completed"
|
||||
except Exception as exc:
|
||||
run.status = "failed"
|
||||
@@ -107,6 +107,9 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
)
|
||||
db.add(employee)
|
||||
run.new_count += 1
|
||||
is_new = True
|
||||
else:
|
||||
is_new = False
|
||||
|
||||
employee.full_name = parsed.get("full_name")
|
||||
employee.status = "active"
|
||||
@@ -117,6 +120,16 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
employee.current_checksum = checksum
|
||||
db.flush()
|
||||
|
||||
if is_new:
|
||||
_record_employee_change(
|
||||
db,
|
||||
run,
|
||||
employee,
|
||||
"new",
|
||||
profile_available=True,
|
||||
message="Сотрудник впервые найден в источнике.",
|
||||
)
|
||||
|
||||
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
|
||||
for tab in parsed.get("tabs") or []:
|
||||
db.add(
|
||||
@@ -141,20 +154,70 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
return employee
|
||||
|
||||
|
||||
def _mark_dismissed(db: Session, found_keys: set[str]) -> int:
|
||||
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
    """Verify and dismiss active employees that this crawl run did not find.

    Every employee whose status is "active" and whose ``profile_key`` is not in
    ``found_keys`` has its ``canonical_url`` probed via ``_profile_is_available``.

    * Probe succeeds: the employee stays active and a "missing_from_source"
      change is recorded for ``run``.
    * Probe fails: the employee's status becomes "dismissed", ``dismissed_at``
      is set, and a "dismissed" change is recorded for ``run``.

    The database session is committed before returning.

    Args:
        db: SQLAlchemy session used for the query, the change rows and the commit.
        run: Crawl run the recorded changes are attached to.
        found_keys: Profile keys collected during this run.
        session: HTTP session used to probe profile URLs.
        timeout: Timeout passed through to each probe request.

    Returns:
        The number of employees marked as dismissed.
    """
    dismissed_total = 0
    active_employees = db.scalars(select(Employee).where(Employee.status == "active")).all()
    # One timestamp is shared by every employee dismissed in this pass.
    checked_at = datetime.now(timezone.utc)

    # Only employees absent from this run's link list need to be verified.
    missing = (person for person in active_employees if person.profile_key not in found_keys)
    for person in missing:
        if _profile_is_available(session, person.canonical_url, timeout):
            # The page still responds, so keep the employee active and only
            # log that the source list no longer links to it.
            _record_employee_change(
                db,
                run,
                person,
                "missing_from_source",
                profile_available=True,
                message="Профиль доступен, но ссылка отсутствует в исходном списке.",
            )
        else:
            person.status = "dismissed"
            person.dismissed_at = checked_at
            _record_employee_change(
                db,
                run,
                person,
                "dismissed",
                profile_available=False,
                message="Сотрудник отсутствует в исходном списке, профиль не подтвердился как доступный.",
            )
            dismissed_total += 1

    db.commit()
    return dismissed_total
|
||||
|
||||
|
||||
def _profile_is_available(session: requests.Session, url: str, timeout: int) -> bool:
    """Report whether a GET request for ``url`` yields a non-error status.

    The request is sent with the module-level ``HEADERS`` and follows
    redirects.

    Args:
        session: HTTP session used to issue the request.
        url: Address of the profile page to probe.
        timeout: Timeout passed to the request.

    Returns:
        True when the final response status code is below 400. False when the
        status is 400 or above, or when the request raises
        ``requests.RequestException``.
    """
    try:
        status_code = session.get(
            url,
            headers=HEADERS,
            timeout=timeout,
            allow_redirects=True,
        ).status_code
    except requests.RequestException:
        # Any request-level failure is reported as "not available".
        return False
    return status_code < 400
|
||||
|
||||
|
||||
def _record_employee_change(
    db: Session,
    run: CrawlRun,
    employee: Employee,
    change_type: str,
    *,
    profile_available: bool | None,
    message: str,
) -> None:
    """Add a ``CrawlRunEmployeeChange`` row describing one employee event.

    The row stores the run id, the employee id, and the employee's current
    ``profile_key``, ``canonical_url`` and ``full_name`` as they are at call
    time. The row is only added to the session; this function does not flush
    or commit.

    Args:
        db: SQLAlchemy session the new row is added to.
        run: Crawl run the change belongs to.
        employee: Employee the change is about.
        change_type: Change label stored on the row (callers in this module
            pass "new", "missing_from_source" or "dismissed").
        profile_available: Result of the profile availability check, or None.
        message: Human-readable description stored with the change.
    """
    change = CrawlRunEmployeeChange(
        crawl_run_id=run.id,
        employee_id=employee.id,
        profile_key=employee.profile_key,
        profile_url=employee.canonical_url,
        full_name=employee.full_name,
        change_type=change_type,
        profile_available=profile_available,
        message=message,
    )
    db.add(change)
|
||||
|
||||
|
||||
def _checksum(data: dict) -> str:
    """Return the SHA-256 hex digest of ``data`` serialised as compact JSON.

    Keys are sorted and compact separators are used, so dicts with the same
    content produce the same digest regardless of key insertion order.
    Non-ASCII characters are kept as-is in the JSON text and encoded as UTF-8
    before hashing.

    Args:
        data: JSON-serialisable mapping to fingerprint.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    canonical_json = json.dumps(
        data,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical_json.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user