feature: add MIEM employees parser service with admin UI and MCP
This commit is contained in:
19
app/parser/collector.py
Normal file
19
app/parser/collector.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import Session
|
||||
|
||||
from app.parser.profile_url import normalize_profile_url
|
||||
|
||||
|
||||
def collect_profile_links(session: Session, source_url: str, headers: dict[str, str], timeout: int) -> list[str]:
    """Fetch ``source_url`` and return the profile URLs linked from it.

    The page is requested through ``session`` with the given ``headers`` and
    ``timeout``; an unsuccessful HTTP status raises via ``raise_for_status()``.
    The ``href`` of every ``<a>`` element that has one is passed through
    ``normalize_profile_url``. Falsy results are skipped, and the remaining
    values are returned in first-seen order with duplicates removed.
    """
    page = session.get(source_url, headers=headers, timeout=timeout)
    page.raise_for_status()
    document = BeautifulSoup(page.text, "html.parser")

    normalized_links = (
        normalize_profile_url(link["href"])
        for link in document.find_all("a", href=True)
    )
    # dict keys keep insertion order, so this drops duplicates while keeping
    # the order in which links first appear on the page.
    return list(dict.fromkeys(url for url in normalized_links if url))
|
||||
Reference in New Issue
Block a user