feat: track crawl run employee changes and verify dismissals

This commit is contained in:
Anton
2026-05-06 15:13:15 +03:00
parent 2331c7a28d
commit d0459a2c30
16 changed files with 517 additions and 27 deletions

View File

@@ -1,10 +1,25 @@
from datetime import datetime, timezone
from app.models import CrawlRun, Employee
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
from app.services.crawler import _mark_dismissed, _upsert_employee
def test_mark_dismissed_only_marks_missing_active_employees(db_session):
class FakeResponse:
    """Minimal stand-in for an HTTP response that exposes only a status code."""

    def __init__(self, status_code: int) -> None:
        # The status code is the only piece of response state this fake keeps.
        self.status_code = status_code
class FakeSession:
    """Test double for an HTTP session that answers ``get`` from a lookup table.

    ``statuses`` is indexed by URL and yields the status code to report for it.
    """

    def __init__(self, statuses):
        self.statuses = statuses

    def get(self, url, **_kwargs):
        """Return a ``FakeResponse`` carrying the status registered for ``url``.

        Extra keyword arguments are accepted and ignored. A URL that is not
        present in ``statuses`` is not handled here, so the lookup error
        (``KeyError`` for a plain dict) propagates to the caller.
        """
        return FakeResponse(self.statuses[url])
def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
db_session.add(run)
db_session.add(
Employee(
profile_key="staff:kept",
@@ -16,8 +31,8 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.add(
Employee(
profile_key="staff:gone",
canonical_url="https://www.hse.ru/staff/gone",
profile_key="staff:missing",
canonical_url="https://www.hse.ru/staff/missing",
status="active",
first_seen_at=datetime.now(timezone.utc),
last_seen_at=datetime.now(timezone.utc),
@@ -25,16 +40,53 @@ def test_mark_dismissed_only_marks_missing_active_employees(db_session):
)
db_session.commit()
dismissed = _mark_dismissed(db_session, {"staff:kept"})
dismissed = _mark_dismissed(
db_session,
run,
{"staff:kept"},
FakeSession({"https://www.hse.ru/staff/missing": 200}),
30,
)
assert dismissed == 0
assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"
missing = db_session.query(Employee).filter_by(profile_key="staff:missing").one()
assert missing.status == "active"
assert missing.dismissed_at is None
change = db_session.query(CrawlRunEmployeeChange).one()
assert change.change_type == "missing_from_source"
assert change.profile_available is True
def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session):
    """Check that an active employee is dismissed when the profile URL answers 404.

    The test stores one active employee, calls ``_mark_dismissed`` with an
    empty set as the third argument and a fake session that reports 404 for
    the employee's canonical URL, then verifies the returned count, the
    employee's new state and the single recorded change row.
    """
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    employee = Employee(
        profile_key="staff:gone",
        canonical_url="https://www.hse.ru/staff/gone",
        status="active",
        first_seen_at=datetime.now(timezone.utc),
        last_seen_at=datetime.now(timezone.utc),
    )
    db_session.add_all([run, employee])
    db_session.commit()

    # NOTE(review): the trailing 30 is presumably a request timeout -- confirm
    # against the signature of _mark_dismissed.
    dismissed = _mark_dismissed(
        db_session,
        run,
        set(),
        FakeSession({"https://www.hse.ru/staff/gone": 404}),
        30,
    )

    assert dismissed == 1
    # Only "staff:gone" exists in this test, so assertions are made on it
    # alone; looking up any other profile key with .one() would raise.
    assert employee.status == "dismissed"
    assert employee.dismissed_at is not None

    change = db_session.query(CrawlRunEmployeeChange).one()
    assert change.change_type == "dismissed"
    assert change.profile_available is False
def test_upsert_employee_increments_new_count_for_new_employee(db_session):
def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session):
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
db_session.add(run)
db_session.commit()
@@ -56,3 +108,6 @@ def test_upsert_employee_increments_new_count_for_new_employee(db_session):
db_session.commit()
assert run.new_count == 1
change = db_session.query(CrawlRunEmployeeChange).one()
assert change.change_type == "new"
assert change.full_name == "New Person"