"""Tests for the crawler's dismissal detection and employee upsert helpers."""

from datetime import datetime, timezone

from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.crawler import _mark_dismissed, _upsert_employee


class FakeResponse:
    """Minimal response stand-in that only carries an HTTP status code."""

    def __init__(self, status_code):
        self.status_code = status_code


class FakeSession:
    """HTTP session stand-in answering ``get`` from a URL -> status mapping."""

    def __init__(self, statuses):
        self.statuses = statuses

    def get(self, url, **_kwargs):
        """Return a ``FakeResponse`` with the status registered for ``url``.

        Extra keyword arguments are accepted and ignored. A URL that was not
        registered raises ``KeyError`` from the mapping lookup.
        """
        status_code = self.statuses[url]
        return FakeResponse(status_code)


def _running_crawl_run():
    """Build an unsaved ``CrawlRun`` in the ``running`` state."""
    return CrawlRun(source_url="https://miem.hse.ru/persons", status="running")


def _active_employee(profile_key, canonical_url):
    """Build an unsaved active ``Employee`` first/last seen at the current UTC time."""
    return Employee(
        profile_key=profile_key,
        canonical_url=canonical_url,
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )


def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
    """An unseen employee whose profile answers 200 stays active.

    Only a ``missing_from_source`` change is recorded and nothing is counted
    as dismissed.
    """
    run = _running_crawl_run()
    kept_employee = _active_employee("staff:kept", "https://www.hse.ru/staff/kept")
    missing_employee = _active_employee(
        "staff:missing", "https://www.hse.ru/staff/missing"
    )
    db_session.add_all([run, kept_employee, missing_employee])
    db_session.commit()

    # Only the employee absent from the seen set has a registered profile URL.
    http_session = FakeSession({"https://www.hse.ru/staff/missing": 200})
    seen_profile_keys = {"staff:kept"}

    dismissed_count = _mark_dismissed(
        db_session, run, seen_profile_keys, http_session, 30
    )

    assert dismissed_count == 0

    employees = db_session.query(Employee)
    kept_row = employees.filter_by(profile_key="staff:kept").one()
    assert kept_row.status == "active"

    missing_row = (
        db_session.query(Employee).filter_by(profile_key="staff:missing").one()
    )
    assert missing_row.status == "active"
    assert missing_row.dismissed_at is None

    recorded_change = db_session.query(CrawlRunEmployeeChange).one()
    assert recorded_change.change_type == "missing_from_source"
    assert recorded_change.profile_available is True


def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session):
    """An unseen employee whose profile answers 404 is marked dismissed.

    A ``dismissed`` change is recorded with ``profile_available`` false.
    """
    run = _running_crawl_run()
    employee = _active_employee("staff:gone", "https://www.hse.ru/staff/gone")
    db_session.add_all([run, employee])
    db_session.commit()

    http_session = FakeSession({"https://www.hse.ru/staff/gone": 404})

    # An empty seen set means the only stored employee is missing from the source.
    dismissed_count = _mark_dismissed(db_session, run, set(), http_session, 30)

    assert dismissed_count == 1
    assert employee.status == "dismissed"
    assert employee.dismissed_at is not None

    recorded_change = db_session.query(CrawlRunEmployeeChange).one()
    assert recorded_change.change_type == "dismissed"
    assert recorded_change.profile_available is False


def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session):
    """Upserting an unknown profile bumps ``new_count`` and records a ``new`` change."""
    run = _running_crawl_run()
    db_session.add(run)
    db_session.commit()

    profile_payload = {
        "source_url": "https://www.hse.ru/staff/newperson",
        "profile_type": "staff",
        "profile_id": "newperson",
        "full_name": "New Person",
        "tabs": [],
        "sections": [],
        "parser_version": "0.2.0",
        "_html": "",
    }

    _upsert_employee(db_session, run, profile_payload)
    db_session.commit()

    assert run.new_count == 1

    recorded_change = db_session.query(CrawlRunEmployeeChange).one()
    assert recorded_change.change_type == "new"
    assert recorded_change.full_name == "New Person"