from __future__ import annotations

import gzip
import hashlib
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any

import requests
from sqlalchemy import select
from sqlalchemy.orm import Session

from app.models import ParseResourceCache
from app.version import BACKEND_VERSION


@dataclass(frozen=True)
class CachedResource:
    """Outcome of one ``ResourceCache.fetch_text`` call."""

    # Response body as text (decoded from the stored snapshot on a cache hit).
    text: str
    # SHA-256 hex digest of the UTF-8 encoded body.
    body_hash: str
    # True when the server answered 304 and the stored snapshot was returned.
    from_cache: bool
    # HTTP status code of the response that was actually received.
    status_code: int


class ResourceCache:
    """HTTP fetcher that revalidates against snapshots stored in the database.

    Rows are looked up by ``(profile_key, resource_key, request_fingerprint)``.
    Stored ``ETag`` / ``Last-Modified`` values are replayed as conditional
    request headers so an unchanged resource can be answered with a 304.
    """

    def __init__(self, db: Session):
        self.db = db

    def fetch_text(
        self,
        session: requests.Session,
        *,
        profile_key: str,
        resource_key: str,
        method: str,
        url: str,
        headers: dict[str, str],
        timeout: int,
        json_payload: Any | None = None,
        params: dict[str, Any] | None = None,
    ) -> CachedResource:
        """Fetch ``url`` and return its body, reusing the stored snapshot on 304.

        Args:
            session: ``requests`` session used to send the request.
            profile_key: First part of the cache row key.
            resource_key: Second part of the cache row key.
            method: HTTP method; upper-cased before use.
            url: Target URL.
            headers: Request headers. The mapping is copied, never mutated.
            timeout: Timeout forwarded to ``requests``.
            json_payload: JSON body, sent only for ``POST``.
            params: Query-string parameters.

        Returns:
            A ``CachedResource``. ``from_cache`` is True only when the server
            replied 304 and a stored snapshot existed.

        Raises:
            requests.HTTPError: For 4xx/5xx responses (via
                ``raise_for_status``), and for a 304 received when no cached
                snapshot exists to serve.
        """
        method = method.upper()
        fingerprint = _request_fingerprint(
            method=method,
            url=url,
            json_payload=json_payload,
            params=params,
        )
        cached = self.db.scalar(
            select(ParseResourceCache).where(
                ParseResourceCache.profile_key == profile_key,
                ParseResourceCache.resource_key == resource_key,
                ParseResourceCache.request_fingerprint == fingerprint,
            )
        )

        # Work on a copy so the caller's header mapping is left untouched.
        request_headers = dict(headers)
        if cached:
            if cached.etag:
                request_headers["If-None-Match"] = cached.etag
            if cached.last_modified:
                request_headers["If-Modified-Since"] = cached.last_modified

        response = _send(
            session,
            method=method,
            url=url,
            headers=request_headers,
            timeout=timeout,
            json_payload=json_payload,
            params=params,
        )

        if response.status_code == 304:
            if cached:
                # Not modified: record the revalidation time and serve the
                # stored snapshot.
                cached.fetched_at = datetime.now(timezone.utc)
                self.db.flush()
                return CachedResource(
                    text=gzip.decompress(cached.body_snapshot).decode("utf-8"),
                    body_hash=cached.body_hash,
                    from_cache=True,
                    status_code=response.status_code,
                )
            # raise_for_status() does not raise for 3xx, so without this guard
            # the empty 304 body would be persisted as the resource content and
            # then served as a "cache hit" on every later revalidation.
            raise requests.HTTPError(
                f"304 Not Modified received for {url} without a cached body to serve",
                response=response,
            )

        response.raise_for_status()

        text = response.text
        body_hash = _body_hash(text)
        # Tolerate response objects that expose no ``headers`` attribute.
        response_headers = getattr(response, "headers", None) or {}
        etag = response_headers.get("ETag")
        last_modified = response_headers.get("Last-Modified")
        snapshot = gzip.compress(text.encode("utf-8"))
        fetched_at = datetime.now(timezone.utc)

        if cached:
            # Refresh the existing row in place, including validators that the
            # server may have dropped (they become None).
            cached.method = method
            cached.url = url
            cached.etag = etag
            cached.last_modified = last_modified
            cached.body_hash = body_hash
            cached.body_snapshot = snapshot
            cached.parser_version = BACKEND_VERSION
            cached.fetched_at = fetched_at
        else:
            self.db.add(
                ParseResourceCache(
                    profile_key=profile_key,
                    resource_key=resource_key,
                    method=method,
                    url=url,
                    request_fingerprint=fingerprint,
                    etag=etag,
                    last_modified=last_modified,
                    body_hash=body_hash,
                    body_snapshot=snapshot,
                    parser_version=BACKEND_VERSION,
                    fetched_at=fetched_at,
                )
            )
        self.db.flush()
        return CachedResource(
            text=text,
            body_hash=body_hash,
            from_cache=False,
            status_code=response.status_code,
        )


def _send(
    session: requests.Session,
    *,
    method: str,
    url: str,
    headers: dict[str, str],
    timeout: int,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> requests.Response:
    """Issue the request through ``session`` and return the response.

    Only ``"POST"`` carries ``json_payload``. NOTE: every other ``method``
    value is sent as a GET and ``json_payload`` is ignored for it.
    """
    if method == "POST":
        return session.post(
            url,
            json=json_payload,
            headers=headers,
            timeout=timeout,
            params=params,
        )
    return session.get(url, headers=headers, timeout=timeout, params=params)


def _request_fingerprint(
    *,
    method: str,
    url: str,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> str:
    """Return a SHA-256 hex digest identifying the request.

    The digest covers method, URL, JSON payload and query parameters,
    serialized as compact JSON with sorted keys so that dict ordering does not
    change the result. Request headers are not part of the fingerprint.
    """
    payload = {
        "method": method,
        "url": url,
        "json": json_payload,
        "params": params,
    }
    encoded = json.dumps(
        payload,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(encoded.encode("utf-8")).hexdigest()


def _body_hash(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()