Files
miem_workers/app/parser/collector.py

20 lines
704 B
Python

from bs4 import BeautifulSoup
from requests import Session
from app.parser.profile_url import normalize_profile_url
def collect_profile_links(session: Session, source_url: str, headers: dict[str, str], timeout: int) -> list[str]:
    """Fetch a page and return the unique normalized profile links it contains.

    The page at ``source_url`` is requested through ``session`` with the given
    ``headers`` and ``timeout``. Every ``<a>`` tag carrying an ``href`` is passed
    through ``normalize_profile_url``; hrefs for which that helper returns a
    falsy value are skipped.

    Args:
        session: Session used to perform the GET request.
        source_url: Address of the page to scan for links.
        headers: HTTP headers sent with the request.
        timeout: Timeout forwarded to ``session.get``.

    Returns:
        Normalized links in order of first appearance, without duplicates.

    Raises:
        Whatever ``session.get`` or ``Response.raise_for_status`` raise, e.g.
        an HTTP error for a 4xx/5xx response.
    """
    page = session.get(source_url, headers=headers, timeout=timeout)
    page.raise_for_status()
    document = BeautifulSoup(page.text, "html.parser")

    normalized_links = (
        normalize_profile_url(link_tag["href"])
        for link_tag in document.find_all("a", href=True)
    )
    # dict keys keep first-insertion order, so this de-duplicates the links
    # while preserving the order in which they appear on the page.
    unique_links = dict.fromkeys(link for link in normalized_links if link)
    return list(unique_links)