20 lines
704 B
Python
20 lines
704 B
Python
from bs4 import BeautifulSoup
|
|
from requests import Session
|
|
|
|
from app.parser.profile_url import normalize_profile_url
|
|
|
|
|
|
def collect_profile_links(session: Session, source_url: str, headers: dict[str, str], timeout: int) -> list[str]:
    """Return the unique normalized profile URLs linked from *source_url*.

    The page is fetched through *session* with the supplied *headers* and
    *timeout*, then parsed with BeautifulSoup's ``html.parser`` backend.
    The ``href`` of every ``<a>`` tag that has one is passed through
    ``normalize_profile_url``; falsy results are dropped and duplicates are
    removed, keeping the order in which each URL first appears on the page.

    Any exception raised by ``session.get`` or by
    ``response.raise_for_status()`` propagates to the caller.
    """
    page = session.get(source_url, headers=headers, timeout=timeout)
    page.raise_for_status()
    document = BeautifulSoup(page.text, "html.parser")

    # Lazily normalize each href exactly once, in document order.
    candidates = (
        normalize_profile_url(link["href"])
        for link in document.find_all("a", href=True)
    )

    # A dict preserves insertion order, so its keys act as an ordered set:
    # the first occurrence of each URL wins and later repeats are ignored.
    unique_urls = dict.fromkeys(url for url in candidates if url)
    return list(unique_urls)
|