feat: adds crawl resource cache
This commit is contained in:
@@ -200,13 +200,14 @@ def test_run_payload_calculates_progress():
|
||||
status="running",
|
||||
found_count=10,
|
||||
parsed_count=4,
|
||||
skipped_count=2,
|
||||
error_count=1,
|
||||
)
|
||||
|
||||
payload = run_payload(run)
|
||||
|
||||
assert payload["processed_count"] == 5
|
||||
assert payload["progress_percent"] == 50.0
|
||||
assert payload["processed_count"] == 7
|
||||
assert payload["progress_percent"] == 70.0
|
||||
assert payload["status_display"] == "Выполняется"
|
||||
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ def test_health_returns_versions():
|
||||
response = client.get("/api/health")
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()["backend_version"] == "0.5.0"
|
||||
assert response.json()["backend_version"] == "0.6.0"
|
||||
|
||||
|
||||
def test_mcp_lists_tools_without_auth_and_ignores_auth_header():
|
||||
@@ -154,7 +154,7 @@ def test_mcp_service_info_returns_tools_and_dataset_hash():
|
||||
assert response.status_code == 200
|
||||
payload = json.loads(response.json()["result"]["content"][0]["text"])
|
||||
assert payload["service_name"] == "miem-employees"
|
||||
assert payload["backend_version"] == "0.5.0"
|
||||
assert payload["backend_version"] == "0.6.0"
|
||||
assert payload["dataset"]["hash"]
|
||||
assert any(tool["name"] == "sync_employees" for tool in payload["tools"])
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import gzip
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee
|
||||
from app.services.crawler import _mark_dismissed, _upsert_employee
|
||||
from app.models import CrawlRun, CrawlRunEmployeeChange, Employee, EmployeeSnapshot, ParseResourceCache
|
||||
from app.services.crawler import _checksum, _mark_dismissed, _upsert_employee
|
||||
from app.services.resource_cache import ResourceCache
|
||||
|
||||
|
||||
class FakeResponse:
|
||||
@@ -17,6 +19,34 @@ class FakeSession:
|
||||
return FakeResponse(self.statuses[url])
|
||||
|
||||
|
||||
class ConditionalResponse:
    """Fake HTTP response that remembers whether its body was accessed.

    The ``text_read`` flag starts as ``False`` and flips to ``True`` the first
    time ``text`` is read, so a test can check that a response body was
    never touched.
    """

    def __init__(self, status_code, text="", headers=None):
        """Store the status code, body text and headers of the fake response.

        Args:
            status_code: HTTP status code to expose as ``status_code``.
            text: Body returned by the ``text`` property.
            headers: Response headers; a falsy value becomes an empty dict.
        """
        self.text_read = False
        self.headers = headers if headers else {}
        self._text = text
        self.status_code = status_code

    @property
    def text(self):
        """Return the body and record that it has been read."""
        self.text_read = True
        return self._text

    def raise_for_status(self):
        """Never raise; this fake treats every status code as acceptable."""
        return
|
||||
|
||||
|
||||
class ConditionalSession:
    """Fake HTTP session that answers conditional GET requests.

    Every call to ``get`` is recorded in ``requests`` as a ``(url, kwargs)``
    tuple. A request whose ``If-None-Match`` header equals ``'"cached"'``
    receives the shared 304 response kept in ``not_modified_response``; any
    other request receives a new 200 response with body ``"fresh"``.
    """

    def __init__(self):
        """Create the shared 304 response and an empty request log."""
        self.not_modified_response = ConditionalResponse(304)
        self.requests = []

    def get(self, url, **kwargs):
        """Record the request and return the matching fake response.

        Args:
            url: Requested URL; only recorded, never inspected.
            **kwargs: Request options; ``headers`` must be present because
                it is looked up directly.

        Returns:
            The shared 304 response when the cached ETag was sent,
            otherwise a new 200 response carrying ``ETag: "fresh"``.
        """
        self.requests.append((url, kwargs))
        sent_etag = kwargs["headers"].get("If-None-Match")
        if sent_etag != '"cached"':
            return ConditionalResponse(200, "fresh", {"ETag": '"fresh"'})
        return self.not_modified_response
|
||||
|
||||
|
||||
def test_mark_dismissed_records_missing_source_when_profile_is_available(db_session):
|
||||
run = CrawlRun(source_url="https://miem.hse.ru/persons", status="running")
|
||||
db_session.add(run)
|
||||
@@ -111,3 +141,86 @@ def test_upsert_employee_increments_new_count_and_records_change_for_new_employe
|
||||
change = db_session.query(CrawlRunEmployeeChange).one()
|
||||
assert change.change_type == "new"
|
||||
assert change.full_name == "New Person"
|
||||
|
||||
|
||||
def test_resource_cache_uses_etag_and_reuses_cached_body_on_304(db_session):
    """A stored ETag is sent as If-None-Match and a 304 reuses the cached body.

    Seeds one ``ParseResourceCache`` row, fetches the same resource through
    ``ResourceCache.fetch_text`` with a fake session, and checks that the
    cached text is returned without the 304 response body being read.
    """
    cached_url = "https://www.hse.ru/staff/cached"
    compressed_body = gzip.compress("cached body".encode("utf-8"))
    cached_entry = ParseResourceCache(
        profile_key="staff:cached",
        resource_key="main-html",
        method="GET",
        url=cached_url,
        # NOTE(review): presumably the fingerprint ResourceCache derives for
        # the request issued below; confirm against the cache implementation.
        request_fingerprint="020d59db7b358d9023d0f185bcbf5a9c085d3cf2bf91d92d48eee9147e8d0f01",
        etag='"cached"',
        body_hash="cached-hash",
        body_snapshot=compressed_body,
        parser_version="0.6.0",
    )
    db_session.add(cached_entry)
    db_session.commit()

    session = ConditionalSession()
    cache = ResourceCache(db_session)
    result = cache.fetch_text(
        session,
        profile_key="staff:cached",
        resource_key="main-html",
        method="GET",
        url=cached_url,
        headers={"User-Agent": "test"},
        timeout=10,
    )

    # Each recorded request is a (url, kwargs) tuple; inspect the first one.
    first_request_kwargs = session.requests[0][1]
    assert first_request_kwargs["headers"]["If-None-Match"] == '"cached"'
    assert result.text == "cached body"
    assert result.from_cache is True
    # The body of the 304 response must never have been accessed.
    assert session.not_modified_response.text_read is False
|
||||
|
||||
|
||||
def test_upsert_employee_skips_snapshot_when_checksum_is_unchanged(db_session):
    """Upserting identical parsed data in two runs yields a single snapshot.

    The first upsert must report a change and the second must not, and only
    one ``EmployeeSnapshot`` row may exist afterwards.
    """
    runs = [
        CrawlRun(source_url="https://miem.hse.ru/persons", status="running"),
        CrawlRun(source_url="https://miem.hse.ru/persons", status="running"),
    ]
    db_session.add_all(runs)
    db_session.commit()

    # Upsert the same payload once per run, keeping only the "changed" flag.
    changed_flags = []
    for run in runs:
        _, changed = _upsert_employee(db_session, run, _parsed_employee("same"))
        changed_flags.append(changed)
    db_session.commit()

    assert changed_flags[0] is True
    assert changed_flags[1] is False
    assert db_session.query(EmployeeSnapshot).count() == 1
|
||||
|
||||
|
||||
def test_checksum_changes_when_widget_data_changes():
    """Adding a publications section must produce a different checksum."""
    publications_section = {
        "type": "publications",
        "publications": [{"id": "1", "title": "New publication"}],
    }
    base = _parsed_employee("widgets")
    # Same payload as ``base`` except for the replaced ``sections`` list.
    changed = {**_parsed_employee("widgets"), "sections": [publications_section]}

    base_checksum = _checksum(base)
    changed_checksum = _checksum(changed)
    assert base_checksum != changed_checksum
|
||||
|
||||
|
||||
def test_checksum_ignores_date_dependent_experience_text():
    """Payloads differing only in the experience year count hash the same.

    The two raw texts read "Work experience at HSE University: 5 years" and
    "... 6 years" in Russian; they are identical apart from the number.
    """
    five_years = {
        **_parsed_employee("experience"),
        "sections": [{"raw_text": "Стаж работы в НИУ ВШЭ: 5 лет"}],
    }
    six_years = {
        **_parsed_employee("experience"),
        "sections": [{"raw_text": "Стаж работы в НИУ ВШЭ: 6 лет"}],
    }

    five_years_checksum = _checksum(five_years)
    six_years_checksum = _checksum(six_years)
    assert five_years_checksum == six_years_checksum
|
||||
|
||||
|
||||
def _parsed_employee(profile_id: str) -> dict:
|
||||
return {
|
||||
"source_url": f"https://www.hse.ru/staff/{profile_id}",
|
||||
"profile_type": "staff",
|
||||
"profile_id": profile_id,
|
||||
"full_name": "Same Person",
|
||||
"tabs": [],
|
||||
"sections": [],
|
||||
"parser_version": "0.6.0",
|
||||
"_html": "<html></html>",
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user