feat: add employee news links parsing and storage
This commit is contained in:
@@ -15,6 +15,7 @@ from app.models import (
|
||||
CrawlRun,
|
||||
CrawlRunEmployeeChange,
|
||||
Employee,
|
||||
EmployeeNewsLink,
|
||||
EmployeePublication,
|
||||
EmployeeSnapshot,
|
||||
ParserSource,
|
||||
@@ -230,6 +231,7 @@ def _upsert_employee(db: Session, run: CrawlRun, parsed: dict) -> tuple[Employee
|
||||
)
|
||||
db.flush()
|
||||
_try_sync_employee_publications(db, run, employee, parsed)
|
||||
_try_sync_employee_news_links(db, run, employee, parsed)
|
||||
return employee, changed
|
||||
|
||||
|
||||
@@ -349,6 +351,108 @@ def _int_or_none(value: object) -> int | None:
|
||||
return None
|
||||
|
||||
|
||||
def _try_sync_employee_news_links(db: Session, run: CrawlRun, employee: Employee, parsed: dict) -> None:
    """Sync an employee's news links without letting a failure propagate.

    Nothing happens when *parsed* yields no news link payloads or when the
    news link table does not exist. Otherwise the sync runs inside
    ``db.begin_nested()``. Any ``Exception`` raised along the way is caught
    and stored as a ``CrawlError`` attached to *run*.
    """
    try:
        # Payload check first: the table check is only evaluated when there
        # is something to store.
        if _news_link_payloads(parsed) and _employee_news_links_table_exists(db):
            with db.begin_nested():
                _sync_employee_news_links(db, employee, parsed)
    except Exception as exc:
        failure = CrawlError(
            crawl_run_id=run.id,
            profile_url=employee.canonical_url,
            error_type=type(exc).__name__,
            message=f"Не удалось сохранить новости сотрудника: {exc}",
        )
        db.add(failure)
|
||||
|
||||
|
||||
def _employee_news_links_table_exists(db: Session) -> bool:
    """Return whether the table named by ``EmployeeNewsLink.__tablename__`` exists.

    The check is made by inspecting the session's current connection.
    """
    inspector = inspect(db.connection())
    table_name = EmployeeNewsLink.__tablename__
    return inspector.has_table(table_name)
|
||||
|
||||
|
||||
def _sync_employee_news_links(db: Session, employee: Employee, parsed: dict) -> None:
    """Upsert the employee's news links from *parsed* and delete stale rows.

    Each payload is matched to an existing row of this employee, first by
    cleaned URL (when the payload has one) and then by payload hash. A new
    row is created when neither matches. After all payloads are applied,
    rows of this employee whose hash was not seen in this pass are deleted.
    Nothing is deleted when there were no payloads at all.
    """
    current_hashes = set()

    for payload in _news_link_payloads(parsed):
        payload_hash = _news_link_hash(payload)
        current_hashes.add(payload_hash)
        link_url = _clean_optional(payload.get("url"))

        # Lookup criteria in priority order: URL (if any), then content hash.
        criteria = []
        if link_url:
            criteria.append(EmployeeNewsLink.url == link_url)
        criteria.append(EmployeeNewsLink.source_hash == payload_hash)

        record = None
        for criterion in criteria:
            record = db.scalar(
                select(EmployeeNewsLink).where(
                    EmployeeNewsLink.employee_id == employee.id,
                    criterion,
                )
            )
            if record:
                break

        if not record:
            record = EmployeeNewsLink(
                employee_id=employee.id,
                source_hash=payload_hash,
                title=_news_link_title(payload),
            )
            db.add(record)

        _apply_news_link(record, payload, payload_hash)

    if not current_hashes:
        return

    # Anything stored for this employee that was not part of this pass is stale.
    stale_query = select(EmployeeNewsLink).where(
        EmployeeNewsLink.employee_id == employee.id,
        EmployeeNewsLink.source_hash.not_in(current_hashes),
    )
    for outdated in db.scalars(stale_query).all():
        db.delete(outdated)
|
||||
|
||||
|
||||
def _news_link_payloads(parsed: dict) -> list[dict]:
    """Collect the news link dicts found in the ``"news"`` sections of *parsed*.

    Only dict sections whose ``"type"`` equals ``"news"`` are considered, and
    only dict entries of their ``"news_links"`` value are kept. A missing or
    empty ``"sections"`` / ``"news_links"`` value contributes nothing.
    """
    collected = []
    for block in parsed.get("sections") or []:
        if isinstance(block, dict) and block.get("type") == "news":
            collected.extend(
                entry for entry in block.get("news_links") or [] if isinstance(entry, dict)
            )
    return collected
|
||||
|
||||
|
||||
def _apply_news_link(target: EmployeeNewsLink, news_link: dict, source_hash: str) -> None:
    """Copy the fields of a parsed news link payload onto *target*.

    ``raw_data`` is taken from the payload's own ``"raw_data"`` entry when
    that entry is a dict; otherwise the whole payload is stored.
    """
    embedded_raw = news_link.get("raw_data")

    target.title = _news_link_title(news_link)
    target.url = _clean_optional(news_link.get("url"))
    target.summary = _clean_optional(news_link.get("summary"))
    target.published_at = _datetime_or_none(news_link.get("published_at"))
    target.published_year = _int_or_none(news_link.get("published_year"))
    if isinstance(embedded_raw, dict):
        target.raw_data = embedded_raw
    else:
        target.raw_data = news_link
    target.source_hash = source_hash
|
||||
|
||||
|
||||
def _news_link_hash(news_link: dict) -> str:
    """Return the ``_payload_hash`` of a news link payload.

    The hash is taken over the payload's ``"raw_data"`` entry when that entry
    is a dict, and over the payload itself otherwise.
    """
    embedded_raw = news_link.get("raw_data")
    if isinstance(embedded_raw, dict):
        return _payload_hash(embedded_raw)
    return _payload_hash(news_link)
|
||||
|
||||
|
||||
def _news_link_title(news_link: dict) -> str:
    """Return a display title for a news link payload.

    The cleaned ``"title"`` is preferred, then the cleaned ``"url"``; when
    neither yields a value the literal ``"Untitled news"`` is returned.
    """
    # Clean each candidate on its own. Choosing between the raw values first
    # would let a title that cleans to nothing (presumably blank or
    # whitespace-only; confirm against _clean_optional) hide a usable URL.
    for key in ("title", "url"):
        candidate = news_link.get(key)
        if not candidate:
            continue
        cleaned = _clean_optional(candidate)
        if cleaned:
            return cleaned
    return "Untitled news"
|
||||
|
||||
|
||||
def _datetime_or_none(value: object) -> datetime | None:
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def _mark_dismissed(db: Session, run: CrawlRun, found_keys: set[str], session: requests.Session, timeout: int) -> int:
|
||||
dismissed = 0
|
||||
active = db.scalars(select(Employee).where(Employee.status == "active")).all()
|
||||
|
||||
Reference in New Issue
Block a user