114 lines
3.5 KiB
Python
114 lines
3.5 KiB
Python
from datetime import datetime, timezone
|
|
|
|
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
|
|
from app.services.crawler import _mark_dismissed, _upsert_employee
|
|
|
|
|
|
class FakeResponse:
    """Minimal stand-in for an HTTP response object.

    Only the status code is stored; no body or headers are modelled.
    """

    def __init__(self, status_code):
        """Remember *status_code* so callers can read ``response.status_code``."""
        self.status_code = status_code
|
|
|
|
|
|
class FakeSession:
    """Stub HTTP session that answers ``get`` from a canned URL -> status table."""

    def __init__(self, statuses):
        """Store *statuses*, a mapping of URL to the status code to report."""
        self.statuses = statuses

    def get(self, url, **_kwargs):
        """Return a ``FakeResponse`` carrying the status stubbed for *url*.

        Extra keyword arguments are accepted and ignored. A URL that is not
        present in ``statuses`` fails the lookup (``KeyError`` for a dict),
        which surfaces unexpected requests instead of hiding them.
        """
        return FakeResponse(self.statuses[url])
|
|
|
|
|
|
def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
    """A missing employee whose profile still answers 200 is not dismissed.

    Two active employees are stored; only ``staff:kept`` is passed to
    ``_mark_dismissed`` in the key set. The profile URL of ``staff:missing``
    is stubbed to return 200, so the test expects no dismissal and a single
    ``missing_from_source`` change row flagged as profile-available.
    """
    # One timestamp for every seen-at field keeps the fixture rows consistent.
    now = datetime.now(timezone.utc)

    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(run)
    db_session.add(
        Employee(
            profile_key="staff:kept",
            canonical_url="https://www.hse.ru/staff/kept",
            status="active",
            first_seen_at=now,
            last_seen_at=now,
        )
    )
    db_session.add(
        Employee(
            profile_key="staff:missing",
            canonical_url="https://www.hse.ru/staff/missing",
            status="active",
            first_seen_at=now,
            last_seen_at=now,
        )
    )
    db_session.commit()

    # Arguments are positional because the helper's parameter names are not
    # visible here; the set is presumably the profile keys seen in this crawl
    # and 30 presumably a request timeout -- confirm against the crawler.
    dismissed = _mark_dismissed(
        db_session,
        run,
        {"staff:kept"},
        FakeSession({"https://www.hse.ru/staff/missing": 200}),
        30,
    )

    assert dismissed == 0
    assert db_session.query(Employee).filter_by(profile_key="staff:kept").one().status == "active"

    missing = db_session.query(Employee).filter_by(profile_key="staff:missing").one()
    assert missing.status == "active"
    assert missing.dismissed_at is None

    # Exactly one change row is expected; ``.one()`` fails on zero or many.
    change = db_session.query(CrawlRunEmployeeChange).one()
    assert change.change_type == "missing_from_source"
    assert change.profile_available is True
|
|
|
|
|
|
def test_mark_dismissed_marks_missing_employee_when_profile_is_unavailable(db_session):
    """A missing employee whose profile answers 404 is marked dismissed.

    A single active employee is stored and an empty key set is passed to
    ``_mark_dismissed``. The profile URL is stubbed to return 404, so the test
    expects one dismissal, a populated ``dismissed_at`` and a ``dismissed``
    change row flagged as profile-unavailable.
    """
    # One timestamp for both seen-at fields keeps the fixture row consistent.
    now = datetime.now(timezone.utc)

    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    employee = Employee(
        profile_key="staff:gone",
        canonical_url="https://www.hse.ru/staff/gone",
        status="active",
        first_seen_at=now,
        last_seen_at=now,
    )
    db_session.add_all([run, employee])
    db_session.commit()

    # Arguments are positional because the helper's parameter names are not
    # visible here; 30 is presumably a request timeout -- confirm.
    dismissed = _mark_dismissed(
        db_session,
        run,
        set(),
        FakeSession({"https://www.hse.ru/staff/gone": 404}),
        30,
    )

    assert dismissed == 1
    assert employee.status == "dismissed"
    assert employee.dismissed_at is not None

    # Exactly one change row is expected; ``.one()`` fails on zero or many.
    change = db_session.query(CrawlRunEmployeeChange).one()
    assert change.change_type == "dismissed"
    assert change.profile_available is False
|
|
|
|
|
|
def test_upsert_employee_increments_new_count_and_records_change_for_new_employee(db_session):
    """Upserting a previously unseen profile counts it as new and logs the change.

    The database starts with only the crawl run, so the parsed profile passed
    to ``_upsert_employee`` has no existing employee row. The test expects
    ``run.new_count`` to be 1 and a single ``new`` change row carrying the
    profile's full name.
    """
    run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
    db_session.add(run)
    db_session.commit()

    # Parsed-profile payload handed to the upsert helper.
    parsed_profile = {
        "source_url": "https://www.hse.ru/staff/newperson",
        "profile_type": "staff",
        "profile_id": "newperson",
        "full_name": "New Person",
        "tabs": [],
        "sections": [],
        "parser_version": "0.2.0",
        "_html": "<html></html>",
    }

    _upsert_employee(db_session, run, parsed_profile)
    db_session.commit()

    assert run.new_count == 1

    # Exactly one change row is expected; ``.one()`` fails on zero or many.
    change = db_session.query(CrawlRunEmployeeChange).one()
    assert change.change_type == "new"
    assert change.full_name == "New Person"
|