feat: adds crawl resource cache
This commit is contained in:
@@ -208,6 +208,7 @@ def _run_payload(run: CrawlRun) -> dict:
|
||||
"finished_at": run.finished_at.isoformat() if run.finished_at else None,
|
||||
"found_count": run.found_count,
|
||||
"parsed_count": run.parsed_count,
|
||||
"skipped_count": run.skipped_count,
|
||||
"error_count": run.error_count,
|
||||
"dismissed_count": run.dismissed_count,
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ class CrawlRun(Base):
|
||||
finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
|
||||
found_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
parsed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
skipped_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
new_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
error_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
|
||||
@@ -137,6 +138,27 @@ class ParserSource(Base):
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
||||
|
||||
|
||||
class ParseResourceCache(Base):
    """Stored response body for one fetched resource of one profile.

    A row is unique per ``(profile_key, resource_key, request_fingerprint)``.
    The validators (``etag`` / ``last_modified``) and the body snapshot are
    kept side by side so a later fetch can be revalidated against the stored
    copy.
    """

    __tablename__ = "parse_resource_cache"
    __table_args__ = (
        # One cached body per profile + logical resource + exact request.
        UniqueConstraint("profile_key", "resource_key", "request_fingerprint", name="uq_parse_resource_cache_resource"),
        Index("ix_parse_resource_cache_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Identity of the owning profile and of the resource within that profile.
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    resource_key: Mapped[str] = mapped_column(String(255), nullable=False)
    # Request description; `request_fingerprint` is sized for a 64-char hex digest.
    method: Mapped[str] = mapped_column(String(16), nullable=False)
    url: Mapped[str] = mapped_column(Text, nullable=False)
    request_fingerprint: Mapped[str] = mapped_column(String(64), nullable=False)
    # Optional HTTP validators captured from the last full response.
    etag: Mapped[str | None] = mapped_column(Text)
    last_modified: Mapped[str | None] = mapped_column(Text)
    # Digest of the body and the stored body bytes themselves.
    body_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    body_snapshot: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)
    parser_version: Mapped[str | None] = mapped_column(String(32))
    fetched_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
||||
|
||||
|
||||
class DatasetVersion(Base):
|
||||
__tablename__ = "dataset_versions"
|
||||
__table_args__ = (
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
@@ -149,22 +151,42 @@ def parse_person_profile(
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
use_playwright: bool = False,
|
||||
resource_cache=None,
|
||||
) -> dict | None:
|
||||
normalized_url = normalize_profile_url(source_url)
|
||||
if not normalized_url:
|
||||
return None
|
||||
response = session.get(normalized_url, headers=headers, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
html = response.text
|
||||
profile_type, profile_id = parse_profile_identity(normalized_url)
|
||||
cache_profile_key = f"{profile_type}:{profile_id}"
|
||||
resource_manifest = []
|
||||
html = _fetch_text(
|
||||
session,
|
||||
normalized_url,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=cache_profile_key,
|
||||
resource_key="main-html",
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if use_playwright:
|
||||
html = _render_with_playwright(normalized_url, html)
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
profile_type, profile_id = parse_profile_identity(normalized_url)
|
||||
header = extract_person_header(soup, normalized_url)
|
||||
tabs = extract_person_tabs(soup, normalized_url)
|
||||
sections = extract_sections(soup, normalized_url)
|
||||
sections = enrich_sections_from_hse_widgets(session, soup, normalized_url, headers, timeout, sections)
|
||||
sections = enrich_sections_from_hse_widgets(
|
||||
session,
|
||||
soup,
|
||||
normalized_url,
|
||||
headers,
|
||||
timeout,
|
||||
sections,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=cache_profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
internal_links = [tab["href"] for tab in tabs if tab.get("href")]
|
||||
|
||||
return {
|
||||
@@ -181,6 +203,7 @@ def parse_person_profile(
|
||||
"employee_internal_links": internal_links,
|
||||
"parser_version": BACKEND_VERSION,
|
||||
"_html": html,
|
||||
"_resource_manifest": resource_manifest,
|
||||
}
|
||||
|
||||
|
||||
@@ -191,13 +214,33 @@ def enrich_sections_from_hse_widgets(
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
sections: list[dict],
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
enriched = list(sections)
|
||||
publications = _load_widget_publications(session, soup, headers, timeout)
|
||||
publications = _load_widget_publications(
|
||||
session,
|
||||
soup,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if publications:
|
||||
enriched = _upsert_publications_section(enriched, publications)
|
||||
|
||||
theses = _load_widget_graduation_theses(session, soup, source_url, headers, timeout)
|
||||
theses = _load_widget_graduation_theses(
|
||||
session,
|
||||
soup,
|
||||
source_url,
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_manifest=resource_manifest,
|
||||
)
|
||||
if theses:
|
||||
enriched = _upsert_graduation_theses_section(enriched, theses)
|
||||
return enriched
|
||||
@@ -226,7 +269,16 @@ def _render_with_playwright(source_url: str, fallback_html: str) -> str:
|
||||
return fallback_html
|
||||
|
||||
|
||||
def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: dict[str, str], timeout: int) -> list[dict]:
|
||||
def _load_widget_publications(
|
||||
session: Session,
|
||||
soup: BeautifulSoup,
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
*,
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
script = soup.select_one('script[data-widget-name="AuthorSearch"][data-author]')
|
||||
if not script:
|
||||
return []
|
||||
@@ -251,14 +303,29 @@ def _load_widget_publications(session: Session, soup: BeautifulSoup, headers: di
|
||||
},
|
||||
}
|
||||
try:
|
||||
response = session.post(
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
if resource_cache and profile_key:
|
||||
text = _fetch_text(
|
||||
session,
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_key=f"publications-page-{page_id}",
|
||||
resource_manifest=resource_manifest,
|
||||
method="POST",
|
||||
json_payload=payload,
|
||||
)
|
||||
data = json.loads(text)
|
||||
else:
|
||||
response = session.post(
|
||||
"https://publications.hse.ru/api/searchPubs",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except Exception:
|
||||
return publications
|
||||
|
||||
@@ -309,6 +376,10 @@ def _load_widget_graduation_theses(
|
||||
source_url: str,
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
*,
|
||||
resource_cache=None,
|
||||
profile_key: str | None = None,
|
||||
resource_manifest: list[dict] | None = None,
|
||||
) -> list[dict]:
|
||||
script = soup.select_one('script[src*="/n/stat/vkr/app.js"][data-person-id]')
|
||||
if not script:
|
||||
@@ -320,14 +391,30 @@ def _load_widget_graduation_theses(
|
||||
|
||||
request_headers = {**headers, "x-portal-language": "ru"}
|
||||
try:
|
||||
response = session.get(
|
||||
urljoin(source_url, api_url),
|
||||
params={"supervisorId": person_id},
|
||||
headers=request_headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
url = urljoin(source_url, api_url)
|
||||
params = {"supervisorId": person_id}
|
||||
if resource_cache and profile_key:
|
||||
text = _fetch_text(
|
||||
session,
|
||||
url,
|
||||
request_headers,
|
||||
timeout,
|
||||
resource_cache=resource_cache,
|
||||
profile_key=profile_key,
|
||||
resource_key="graduation-theses",
|
||||
resource_manifest=resource_manifest,
|
||||
params=params,
|
||||
)
|
||||
data = json.loads(text)
|
||||
else:
|
||||
response = session.get(
|
||||
url,
|
||||
params=params,
|
||||
headers=request_headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
@@ -629,3 +716,62 @@ def _dedupe_dicts(items: list[dict]) -> list[dict]:
|
||||
seen.add(key)
|
||||
unique.append(item)
|
||||
return unique
|
||||
|
||||
|
||||
def _fetch_text(
    session: Session,
    url: str,
    headers: dict[str, str],
    timeout: int,
    *,
    resource_cache=None,
    profile_key: str | None = None,
    resource_key: str,
    resource_manifest: list[dict] | None,
    method: str = "GET",
    json_payload: object | None = None,
    params: dict | None = None,
) -> str:
    """Fetch a resource body as text, optionally through the resource cache.

    When both ``resource_cache`` and ``profile_key`` are truthy the request is
    delegated to ``resource_cache.fetch_text``; otherwise it is sent directly
    on ``session`` (POST when ``method`` upper-cases to ``"POST"``, GET for
    anything else) and ``raise_for_status()`` is applied.

    If ``resource_manifest`` is not ``None`` one entry describing the fetch
    (resource key, method, url, body hash, cache flag, status code) is
    appended to it.

    Returns:
        The response body text.
    """

    def manifest_entry(body_hash, from_cache, status_code) -> dict:
        # Single place that fixes the manifest entry layout for both paths.
        return {
            "resource_key": resource_key,
            "method": method,
            "url": url,
            "body_hash": body_hash,
            "from_cache": from_cache,
            "status_code": status_code,
        }

    track = resource_manifest is not None

    if resource_cache and profile_key:
        hit = resource_cache.fetch_text(
            session,
            profile_key=profile_key,
            resource_key=resource_key,
            method=method,
            url=url,
            headers=headers,
            timeout=timeout,
            json_payload=json_payload,
            params=params,
        )
        if track:
            resource_manifest.append(manifest_entry(hit.body_hash, hit.from_cache, hit.status_code))
        return hit.text

    # Uncached path: talk to the server directly.
    request_kwargs = {"headers": headers, "timeout": timeout, "params": params}
    if method.upper() == "POST":
        reply = session.post(url, json=json_payload, **request_kwargs)
    else:
        reply = session.get(url, **request_kwargs)
    reply.raise_for_status()

    body = reply.text
    if track:
        digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
        resource_manifest.append(manifest_entry(digest, False, reply.status_code))
    return body
|
||||
|
||||
@@ -153,7 +153,7 @@ def stats_payload(db: Session) -> dict[str, Any]:
|
||||
def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
|
||||
if not run:
|
||||
return None
|
||||
processed = run.parsed_count + run.error_count
|
||||
processed = run.parsed_count + run.skipped_count + run.error_count
|
||||
percent = round((processed / run.found_count) * 100, 1) if run.found_count else 0
|
||||
return {
|
||||
"id": run.id,
|
||||
@@ -166,6 +166,7 @@ def run_payload(run: CrawlRun | None) -> dict[str, Any] | None:
|
||||
"finished_display": format_admin_datetime(run.finished_at),
|
||||
"found_count": run.found_count,
|
||||
"parsed_count": run.parsed_count,
|
||||
"skipped_count": run.skipped_count,
|
||||
"new_count": run.new_count,
|
||||
"error_count": run.error_count,
|
||||
"dismissed_count": run.dismissed_count,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
@@ -14,6 +15,7 @@ from app.parser.collector import collect_profile_links
|
||||
from app.parser.profile import parse_person_profile
|
||||
from app.parser.profile_url import profile_key
|
||||
from app.services.dataset_versions import get_or_create_current_version
|
||||
from app.services.resource_cache import ResourceCache
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
|
||||
@@ -29,8 +31,10 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
|
||||
|
||||
found_keys: set[str] = set()
|
||||
parsed_count = 0
|
||||
skipped_count = 0
|
||||
try:
|
||||
with requests.Session() as session:
|
||||
resource_cache = ResourceCache(db)
|
||||
urls = collect_profile_links(session, source.source_url, HEADERS, settings.request_timeout)
|
||||
if settings.crawl_limit:
|
||||
urls = urls[: settings.crawl_limit]
|
||||
@@ -48,12 +52,17 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
|
||||
HEADERS,
|
||||
settings.request_timeout,
|
||||
settings.parser_use_playwright,
|
||||
resource_cache=resource_cache,
|
||||
)
|
||||
if not parsed:
|
||||
continue
|
||||
_upsert_employee(db, run, parsed)
|
||||
parsed_count += 1
|
||||
_, changed = _upsert_employee(db, run, parsed)
|
||||
if changed:
|
||||
parsed_count += 1
|
||||
else:
|
||||
skipped_count += 1
|
||||
run.parsed_count = parsed_count
|
||||
run.skipped_count = skipped_count
|
||||
db.commit()
|
||||
except Exception as exc:
|
||||
run.error_count += 1
|
||||
@@ -69,7 +78,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
|
||||
finally:
|
||||
time.sleep(settings.request_delay_seconds)
|
||||
|
||||
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
|
||||
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
|
||||
run.status = "completed"
|
||||
get_or_create_current_version(db, crawl_run_id=run.id)
|
||||
except Exception as exc:
|
||||
@@ -90,20 +99,25 @@ def refresh_employee(db: Session, employee: Employee, settings: Settings) -> Cra
|
||||
|
||||
try:
|
||||
with requests.Session() as session:
|
||||
resource_cache = ResourceCache(db)
|
||||
parsed = parse_person_profile(
|
||||
session,
|
||||
employee.canonical_url,
|
||||
HEADERS,
|
||||
settings.request_timeout,
|
||||
settings.parser_use_playwright,
|
||||
resource_cache=resource_cache,
|
||||
)
|
||||
if not parsed:
|
||||
raise ValueError("Профиль не удалось распарсить.")
|
||||
if _parsed_profile_key(parsed) != employee.profile_key:
|
||||
raise ValueError("Распарсенный профиль не совпадает с обновляемым сотрудником.")
|
||||
|
||||
_upsert_employee(db, run, parsed)
|
||||
run.parsed_count = 1
|
||||
_, changed = _upsert_employee(db, run, parsed)
|
||||
if changed:
|
||||
run.parsed_count = 1
|
||||
else:
|
||||
run.skipped_count = 1
|
||||
run.status = "completed"
|
||||
get_or_create_current_version(db, crawl_run_id=run.id)
|
||||
except Exception as exc:
|
||||
@@ -140,8 +154,9 @@ def _parsed_profile_key(parsed: dict) -> str:
|
||||
return f"{parsed.get('profile_type')}:{parsed.get('profile_id')}"
|
||||
|
||||
|
||||
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> tuple[Employee, bool]:
|
||||
html = parsed.pop("_html", None)
|
||||
parsed.pop("_resource_manifest", None)
|
||||
checksum = _checksum(parsed)
|
||||
key = _parsed_profile_key(parsed)
|
||||
employee = db.scalar(select(Employee).where(Employee.profile_key == key))
|
||||
@@ -160,12 +175,15 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
else:
|
||||
is_new = False
|
||||
|
||||
parser_version = parsed.get("parser_version")
|
||||
changed = is_new or employee.current_checksum != checksum or employee.parser_version != parser_version
|
||||
employee.full_name = parsed.get("full_name")
|
||||
employee.status = "active"
|
||||
employee.last_seen_at = now
|
||||
employee.dismissed_at = None
|
||||
employee.parser_version = parsed.get("parser_version")
|
||||
employee.current_data = parsed
|
||||
employee.parser_version = parser_version
|
||||
if changed:
|
||||
employee.current_data = parsed
|
||||
employee.current_checksum = checksum
|
||||
db.flush()
|
||||
|
||||
@@ -179,28 +197,29 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> Employee:
|
||||
message="Сотрудник впервые найден в источнике.",
|
||||
)
|
||||
|
||||
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
|
||||
for tab in parsed.get("tabs") or []:
|
||||
if changed:
|
||||
db.query(ProfileTab).filter(ProfileTab.employee_id == employee.id).delete()
|
||||
for tab in parsed.get("tabs") or []:
|
||||
db.add(
|
||||
ProfileTab(
|
||||
employee_id=employee.id,
|
||||
title=tab.get("title") or "",
|
||||
href=tab.get("href") or "",
|
||||
data_index=tab.get("data_index"),
|
||||
)
|
||||
)
|
||||
|
||||
db.add(
|
||||
ProfileTab(
|
||||
EmployeeSnapshot(
|
||||
employee_id=employee.id,
|
||||
title=tab.get("title") or "",
|
||||
href=tab.get("href") or "",
|
||||
data_index=tab.get("data_index"),
|
||||
crawl_run_id=run.id,
|
||||
parsed_data=parsed,
|
||||
html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
|
||||
checksum=checksum,
|
||||
parser_version=parser_version,
|
||||
)
|
||||
)
|
||||
|
||||
db.add(
|
||||
EmployeeSnapshot(
|
||||
employee_id=employee.id,
|
||||
crawl_run_id=run.id,
|
||||
parsed_data=parsed,
|
||||
html_snapshot=gzip.compress(html.encode("utf-8")) if html else None,
|
||||
checksum=checksum,
|
||||
parser_version=parsed.get("parser_version"),
|
||||
)
|
||||
)
|
||||
return employee
|
||||
return employee, changed
|
||||
|
||||
|
||||
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
|
||||
@@ -268,5 +287,23 @@ def _record_employee_change(
|
||||
|
||||
|
||||
def _checksum(data: dict) -> str:
    """Return the SHA-256 hex digest of ``data`` in canonical JSON form.

    ``data`` is first passed through ``_stable_checksum_payload`` and then
    serialized with sorted keys, compact separators and non-ASCII characters
    left unescaped, so the digest does not depend on key order.
    """
    # Serialize once: the raw (un-normalized) dump was a dead store that was
    # overwritten before use.
    payload = json.dumps(_stable_checksum_payload(data), ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _stable_checksum_payload(value):
    """Return a copy of ``value`` with every string normalized for checksumming.

    Dicts and lists are rebuilt recursively (dict keys are kept as-is, only
    values are processed); strings go through
    ``_normalize_date_dependent_experience``; any other value is returned
    unchanged.
    """
    if isinstance(value, str):
        return _normalize_date_dependent_experience(value)
    if isinstance(value, list):
        return [_stable_checksum_payload(element) for element in value]
    if isinstance(value, dict):
        normalized = {}
        for name, child in value.items():
            normalized[name] = _stable_checksum_payload(child)
        return normalized
    return value
|
||||
|
||||
|
||||
def _normalize_date_dependent_experience(value: str) -> str:
    """Replace the year count in "стаж ... N лет/год/года/годов" phrases.

    The leading phrase (captured as group 1) is preserved and the number plus
    its year word is substituted with the literal ``<experience-years>``.
    Matching is case-insensitive. Text without such a phrase is returned
    unchanged.
    """
    experience_years = re.compile(
        r"(?i)(стаж(?:\s+работы)?(?:\s+в\s+ниу\s+вшэ|\s+в\s+вшэ)?\s*:?\s*)\d+\s*(?:год(?:а|ов)?|лет)"
    )
    return experience_years.sub(r"\1<experience-years>", value)
|
||||
|
||||
147
app/services/resource_cache.py
Normal file
147
app/services/resource_cache.py
Normal file
@@ -0,0 +1,147 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import ParseResourceCache
|
||||
from app.version import BACKEND_VERSION
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CachedResource:
    """Immutable result of one fetch made through the resource cache."""

    # Response body as text.
    text: str
    # Hex digest identifying the body.
    body_hash: str
    # True when the body came from the stored snapshot, not the response.
    from_cache: bool
    # HTTP status code of the response that produced this result.
    status_code: int
|
||||
|
||||
|
||||
class ResourceCache:
    """Database-backed store of fetched response bodies with HTTP revalidation.

    Each entry is looked up by profile key, resource key and a fingerprint of
    the request. Stored ``ETag`` / ``Last-Modified`` values are replayed as
    conditional headers so the server can answer 304 and the stored snapshot
    can be reused.
    """

    def __init__(self, db: Session):
        self.db = db

    def fetch_text(
        self,
        session: requests.Session,
        *,
        profile_key: str,
        resource_key: str,
        method: str,
        url: str,
        headers: dict[str, str],
        timeout: int,
        json_payload: Any | None = None,
        params: dict[str, Any] | None = None,
    ) -> CachedResource:
        """Fetch ``url`` and return its body, reusing the stored copy on 304.

        Args:
            session: HTTP session used to send the request.
            profile_key: Owner of the cache entry.
            resource_key: Logical name of the resource within the profile.
            method: HTTP method; upper-cased before use.
            url: Request URL.
            headers: Base request headers; copied, never mutated.
            timeout: Request timeout passed through to the session.
            json_payload: JSON body for POST requests.
            params: Query parameters.

        Returns:
            A ``CachedResource`` whose ``from_cache`` is True only when the
            server answered 304 and the stored snapshot was served.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            status.
        """
        method = method.upper()
        fingerprint = _request_fingerprint(method=method, url=url, json_payload=json_payload, params=params)
        cached = self.db.scalar(
            select(ParseResourceCache).where(
                ParseResourceCache.profile_key == profile_key,
                ParseResourceCache.resource_key == resource_key,
                ParseResourceCache.request_fingerprint == fingerprint,
            )
        )

        request_headers = dict(headers)
        # Validators are only meaningful on the GET that `_send` issues for
        # every non-POST method. On a POST, a matching If-None-Match makes a
        # conforming server answer 412 instead of 304, which
        # raise_for_status() would turn into a failure.
        if cached and method != "POST":
            if cached.etag:
                request_headers["If-None-Match"] = cached.etag
            if cached.last_modified:
                request_headers["If-Modified-Since"] = cached.last_modified

        response = _send(
            session,
            method=method,
            url=url,
            headers=request_headers,
            timeout=timeout,
            json_payload=json_payload,
            params=params,
        )
        if response.status_code == 304 and cached:
            # Not modified: refresh the timestamp and serve the snapshot.
            cached.fetched_at = datetime.now(timezone.utc)
            self.db.flush()
            return CachedResource(
                text=gzip.decompress(cached.body_snapshot).decode("utf-8"),
                body_hash=cached.body_hash,
                from_cache=True,
                status_code=response.status_code,
            )

        response.raise_for_status()
        text = response.text
        body_hash = _body_hash(text)
        # Response stand-ins without a `headers` attribute yield no validators.
        response_headers = getattr(response, "headers", None) or {}
        etag = response_headers.get("ETag")
        last_modified = response_headers.get("Last-Modified")

        snapshot = gzip.compress(text.encode("utf-8"))
        fetched_at = datetime.now(timezone.utc)
        if cached:
            cached.method = method
            cached.url = url
            cached.etag = etag
            cached.last_modified = last_modified
            cached.body_hash = body_hash
            cached.body_snapshot = snapshot
            cached.parser_version = BACKEND_VERSION
            cached.fetched_at = fetched_at
        else:
            self.db.add(
                ParseResourceCache(
                    profile_key=profile_key,
                    resource_key=resource_key,
                    method=method,
                    url=url,
                    request_fingerprint=fingerprint,
                    etag=etag,
                    last_modified=last_modified,
                    body_hash=body_hash,
                    body_snapshot=snapshot,
                    parser_version=BACKEND_VERSION,
                    fetched_at=fetched_at,
                )
            )
        self.db.flush()
        return CachedResource(text=text, body_hash=body_hash, from_cache=False, status_code=response.status_code)
|
||||
|
||||
|
||||
def _send(
    session: requests.Session,
    *,
    method: str,
    url: str,
    headers: dict[str, str],
    timeout: int,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> requests.Response:
    """Send the request on ``session`` and return the response.

    ``method`` equal to ``"POST"`` (exact, upper-case) issues a POST carrying
    ``json_payload``; every other value issues a GET, and ``json_payload`` is
    ignored.
    """
    shared = {"headers": headers, "timeout": timeout, "params": params}
    if method != "POST":
        return session.get(url, **shared)
    return session.post(url, json=json_payload, **shared)
|
||||
|
||||
|
||||
def _request_fingerprint(
    *,
    method: str,
    url: str,
    json_payload: Any | None,
    params: dict[str, Any] | None,
) -> str:
    """Return a SHA-256 hex digest identifying the request.

    The method, URL, JSON body and query parameters are serialized as compact
    JSON with sorted keys (non-ASCII left unescaped) and hashed, so two
    requests differing only in dict key order share a fingerprint.
    """
    canonical = json.dumps(
        {"method": method, "url": url, "json": json_payload, "params": params},
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _body_hash(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
||||
@@ -89,12 +89,14 @@
|
||||
const status = document.querySelector("[data-progress-status]");
|
||||
const processed = document.querySelector("[data-progress-processed]");
|
||||
const found = document.querySelector("[data-progress-found]");
|
||||
const skipped = document.querySelector("[data-progress-skipped]");
|
||||
const errors = document.querySelector("[data-progress-errors]");
|
||||
const fill = document.querySelector("[data-progress-fill]");
|
||||
const percent = document.querySelector("[data-progress-percent]");
|
||||
if (status) status.textContent = run.status_display || run.status;
|
||||
if (processed) processed.textContent = run.processed_count;
|
||||
if (found) found.textContent = run.found_count;
|
||||
if (skipped) skipped.textContent = run.skipped_count;
|
||||
if (errors) errors.textContent = run.error_count;
|
||||
if (fill) fill.style.width = `${run.progress_percent}%`;
|
||||
if (percent) percent.textContent = run.progress_percent;
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
<div class="progress-panel__meta">
|
||||
<span data-progress-status>{{ run.status_display if run else "Ожидание" }}</span>
|
||||
<span>обработано: <span data-progress-processed>{{ run.processed_count if run else 0 }}</span> / <span data-progress-found>{{ run.found_count if run else 0 }}</span></span>
|
||||
<span>без изменений: <span data-progress-skipped>{{ run.skipped_count if run else 0 }}</span></span>
|
||||
<span>ошибок: <span data-progress-errors>{{ run.error_count if run else 0 }}</span></span>
|
||||
</div>
|
||||
<div class="progress-bar" aria-label="Parsing progress">
|
||||
@@ -48,10 +49,10 @@
|
||||
<section class="panel">
|
||||
<h2 class="panel__title">Последние запуски</h2>
|
||||
<table class="table">
|
||||
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Обработано</th><th class="table__head">Ошибки</th><th class="table__head">Старт</th></tr></thead>
|
||||
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Обработано</th><th class="table__head">Без изменений</th><th class="table__head">Ошибки</th><th class="table__head">Старт</th></tr></thead>
|
||||
<tbody>
|
||||
{% for run in runs %}
|
||||
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
|
||||
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.skipped_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
<div class="stats-strip">
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Найдено</span><span class="stats-strip__value">{{ run.found_count }}</span></div>
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Обработано</span><span class="stats-strip__value">{{ run.parsed_count }}</span></div>
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Без изменений</span><span class="stats-strip__value">{{ run.skipped_count }}</span></div>
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Новые</span><span class="stats-strip__value">{{ run.new_count }}</span></div>
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Потеряшки</span><span class="stats-strip__value">{{ run.changes.missing_from_source | length }}</span></div>
|
||||
<div class="stats-strip__item"><span class="stats-strip__label">Уволены</span><span class="stats-strip__value">{{ run.dismissed_count }}</span></div>
|
||||
|
||||
@@ -8,12 +8,13 @@
|
||||
</div>
|
||||
{% set run = runs[0] if runs else none %}
|
||||
{% if run %}
|
||||
{% set processed = run.parsed_count + run.error_count %}
|
||||
{% set processed = run.parsed_count + run.skipped_count + run.error_count %}
|
||||
{% set percent = ((processed / run.found_count) * 100) | round(1) if run.found_count else 0 %}
|
||||
<div class="progress-panel" data-progress-panel>
|
||||
<div class="progress-panel__meta">
|
||||
<span data-progress-status>{{ run.status_display }}</span>
|
||||
<span>обработано: <span data-progress-processed>{{ processed }}</span> / <span data-progress-found>{{ run.found_count }}</span></span>
|
||||
<span>без изменений: <span data-progress-skipped>{{ run.skipped_count }}</span></span>
|
||||
<span>ошибок: <span data-progress-errors>{{ run.error_count }}</span></span>
|
||||
</div>
|
||||
<div class="progress-bar" aria-label="Parsing progress">
|
||||
@@ -26,6 +27,7 @@
|
||||
<div class="progress-panel__meta">
|
||||
<span data-progress-status>Ожидание</span>
|
||||
<span>обработано: <span data-progress-processed>0</span> / <span data-progress-found>0</span></span>
|
||||
<span>без изменений: <span data-progress-skipped>0</span></span>
|
||||
<span>ошибок: <span data-progress-errors>0</span></span>
|
||||
</div>
|
||||
<div class="progress-bar" aria-label="Parsing progress">
|
||||
@@ -35,10 +37,10 @@
|
||||
</div>
|
||||
{% endif %}
|
||||
<table class="table">
|
||||
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Найдено</th><th class="table__head">Обработано</th><th class="table__head">Новые</th><th class="table__head">Ошибки</th><th class="table__head">Уволены</th><th class="table__head">Старт</th></tr></thead>
|
||||
<thead><tr><th class="table__head">ID</th><th class="table__head">Статус</th><th class="table__head">Найдено</th><th class="table__head">Обработано</th><th class="table__head">Без изменений</th><th class="table__head">Новые</th><th class="table__head">Ошибки</th><th class="table__head">Уволены</th><th class="table__head">Старт</th></tr></thead>
|
||||
<tbody>
|
||||
{% for run in runs %}
|
||||
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
|
||||
<tr class="table__row" onclick="window.location.href='/admin/runs/{{ run.id }}'" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); window.location.href='/admin/runs/{{ run.id }}'; }" role="link" tabindex="0"><td class="table__cell">{{ run.id }}</td><td class="table__cell">{{ run.status_display }}</td><td class="table__cell">{{ run.found_count }}</td><td class="table__cell">{{ run.parsed_count }}</td><td class="table__cell">{{ run.skipped_count }}</td><td class="table__cell">{{ run.new_count }}</td><td class="table__cell">{{ run.error_count }}</td><td class="table__cell">{{ run.dismissed_count }}</td><td class="table__cell">{{ run.started_display }}</td></tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
APP_VERSION = "0.5.0"
|
||||
FRONTEND_VERSION = "0.5.0"
|
||||
BACKEND_VERSION = "0.5.0"
|
||||
APP_VERSION = "0.6.0"
|
||||
FRONTEND_VERSION = "0.6.0"
|
||||
BACKEND_VERSION = "0.6.0"
|
||||
|
||||
@@ -17,7 +17,14 @@ def crawl_once() -> None:
|
||||
settings = get_settings()
|
||||
with SessionLocal() as db:
|
||||
run = run_crawl(db, settings)
|
||||
logger.info("crawl finished: id=%s status=%s parsed=%s errors=%s", run.id, run.status, run.parsed_count, run.error_count)
|
||||
logger.info(
|
||||
"crawl finished: id=%s status=%s parsed=%s skipped=%s errors=%s",
|
||||
run.id,
|
||||
run.status,
|
||||
run.parsed_count,
|
||||
run.skipped_count,
|
||||
run.error_count,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
||||
Reference in New Issue
Block a user