feat: adds crawl resource cache
This commit is contained in:
147
app/services/resource_cache.py
Normal file
147
app/services/resource_cache.py
Normal file
@@ -0,0 +1,147 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import ParseResourceCache
|
||||
from app.version import BACKEND_VERSION
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CachedResource:
    """Immutable result of a cache-aware text fetch.

    Instances are produced by ``ResourceCache.fetch_text`` and describe the
    body that was served together with where it came from.
    """

    # Decoded response body (either the live response or the stored snapshot).
    text: str
    # SHA-256 hex digest associated with ``text``.
    body_hash: str
    # True when the body was served from the stored snapshot (HTTP 304 path).
    from_cache: bool
    # Status code of the HTTP response that was actually received.
    status_code: int
|
||||
|
||||
|
||||
class ResourceCache:
    """HTTP text fetcher backed by ``ParseResourceCache`` rows.

    Each request is identified by ``(profile_key, resource_key, fingerprint)``.
    When a row already exists its validators are replayed as conditional
    request headers so an unchanged resource can be answered from the stored,
    gzip-compressed snapshot.
    """

    def __init__(self, db: Session):
        # Database session used to look up, update and insert cache rows.
        self.db = db

    def fetch_text(
        self,
        session: requests.Session,
        *,
        profile_key: str,
        resource_key: str,
        method: str,
        url: str,
        headers: dict[str, str],
        timeout: int,
        json_payload: Any | None = None,
        params: dict[str, Any] | None = None,
    ) -> CachedResource:
        """Fetch ``url`` and return its text, revalidating against the cache.

        Args:
            session: HTTP session the request is sent on.
            profile_key: First part of the cache lookup key.
            resource_key: Second part of the cache lookup key.
            method: HTTP verb; upper-cased before use.
            url: Target URL.
            headers: Base request headers. The mapping is copied, never mutated.
            timeout: Timeout forwarded to the HTTP call.
            json_payload: Optional JSON body (forwarded to ``_send``).
            params: Optional query parameters.

        Returns:
            A ``CachedResource``. ``from_cache`` is True only when the server
            answered 304 and a cache row existed; the text then comes from the
            stored snapshot.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            response (``requests.HTTPError`` for a real ``requests.Response``).
        """
        method = method.upper()
        fingerprint = _request_fingerprint(method=method, url=url, json_payload=json_payload, params=params)

        lookup = select(ParseResourceCache).where(
            ParseResourceCache.profile_key == profile_key,
            ParseResourceCache.resource_key == resource_key,
            ParseResourceCache.request_fingerprint == fingerprint,
        )
        entry = self.db.scalar(lookup)

        # Work on a copy so the caller's header mapping stays untouched.
        outgoing_headers = dict(headers)
        if entry:
            # Replay only the validators that were actually stored.
            validators = (
                ("If-None-Match", entry.etag),
                ("If-Modified-Since", entry.last_modified),
            )
            outgoing_headers.update({name: value for name, value in validators if value})

        response = _send(
            session,
            method=method,
            url=url,
            headers=outgoing_headers,
            timeout=timeout,
            json_payload=json_payload,
            params=params,
        )

        not_modified = response.status_code == 304
        if not_modified and entry:
            # Unchanged upstream: refresh the timestamp and serve the snapshot.
            entry.fetched_at = datetime.now(timezone.utc)
            self.db.flush()
            snapshot_text = gzip.decompress(entry.body_snapshot).decode("utf-8")
            return CachedResource(
                text=snapshot_text,
                body_hash=entry.body_hash,
                from_cache=True,
                status_code=response.status_code,
            )

        response.raise_for_status()
        text = response.text
        body_hash = _body_hash(text)

        # Response objects without a ``headers`` attribute yield no validators.
        if hasattr(response, "headers"):
            etag = response.headers.get("ETag")
            last_modified = response.headers.get("Last-Modified")
        else:
            etag = None
            last_modified = None

        # Values shared by the update-in-place and the insert paths.
        fresh_values = {
            "method": method,
            "url": url,
            "etag": etag,
            "last_modified": last_modified,
            "body_hash": body_hash,
            "body_snapshot": gzip.compress(text.encode("utf-8")),
            "parser_version": BACKEND_VERSION,
            "fetched_at": datetime.now(timezone.utc),
        }
        if entry:
            for column, value in fresh_values.items():
                setattr(entry, column, value)
        else:
            self.db.add(
                ParseResourceCache(
                    profile_key=profile_key,
                    resource_key=resource_key,
                    request_fingerprint=fingerprint,
                    **fresh_values,
                )
            )

        self.db.flush()
        return CachedResource(
            text=text,
            body_hash=body_hash,
            from_cache=False,
            status_code=response.status_code,
        )
|
||||
|
||||
|
||||
def _send(
    session: requests.Session,
    *,
    method: str,
    url: str,
    headers: dict[str, str],
    timeout: int,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> requests.Response:
    """Send one HTTP request on ``session`` using the requested verb.

    Args:
        session: Session the request is issued on.
        method: HTTP verb, matched case-insensitively. An empty value is
            treated as ``GET``.
        url: Target URL.
        headers: Request headers.
        timeout: Timeout forwarded to the session call.
        json_payload: JSON body; sent for every verb except ``GET``.
        params: Query-string parameters.

    Returns:
        The response object returned by the session.
    """
    # Normalise so "post" is not mistaken for a GET; empty keeps the GET default.
    verb = (method or "GET").upper()

    if verb == "GET":
        # GET deliberately carries no JSON body.
        return session.get(url, headers=headers, timeout=timeout, params=params)
    if verb == "POST":
        return session.post(url, json=json_payload, headers=headers, timeout=timeout, params=params)

    # Any other verb is sent as requested instead of being silently downgraded
    # to GET, so the request on the wire matches the verb the caller asked for.
    return session.request(verb, url, json=json_payload, headers=headers, timeout=timeout, params=params)
|
||||
|
||||
|
||||
def _request_fingerprint(
    *,
    method: str,
    url: str,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> str:
    """Return a SHA-256 hex digest identifying a request.

    The verb, URL, JSON body and query parameters are serialised as compact
    JSON with sorted keys, so mappings that differ only in key order produce
    the same fingerprint. ``json.dumps`` raises ``TypeError`` if a value is
    not JSON-serialisable.
    """
    request_shape = {
        "method": method,
        "url": url,
        "json": json_payload,
        "params": params,
    }
    # Compact separators + sorted keys give one canonical text per request.
    canonical = json.dumps(
        request_shape,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _body_hash(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
||||
Reference in New Issue
Block a user