feature: add MIEM employees parser service with admin UI and MCP

This commit is contained in:
Anton
2026-04-28 16:20:51 +03:00
parent 6480f31e8f
commit d512580960
29 changed files with 1883 additions and 0 deletions

23
tests/conftest.py Normal file
View File

@@ -0,0 +1,23 @@
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool
from app.db import Base
@pytest.fixture()
def db_session():
    """Yield a SQLAlchemy session bound to a fresh in-memory SQLite database.

    The schema described by ``Base.metadata`` is created before the test body
    runs. On teardown the session is closed, the schema is dropped and the
    engine is disposed so its pooled connection is released.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    try:
        Base.metadata.create_all(engine)
        Session = sessionmaker(bind=engine)
        session = Session()
        try:
            yield session
        finally:
            session.close()
            Base.metadata.drop_all(engine)
    finally:
        # Release the pooled connection even if schema setup or teardown fails.
        engine.dispose()

107
tests/test_api_mcp.py Normal file
View File

@@ -0,0 +1,107 @@
from datetime import datetime, timezone
from fastapi.testclient import TestClient
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool
from app.config import Settings, get_settings
from app.db import Base, get_db
from app.main import app
from app.models import Employee
def test_health_returns_versions():
    """GET /api/health responds with 200 and reports backend version 0.1.0."""
    health = TestClient(app).get("/api/health")
    assert health.status_code == 200

    payload = health.json()
    assert payload["backend_version"] == "0.1.0"
def test_mcp_requires_token_and_lists_tools():
    """POST /mcp ``tools/list`` is rejected without a bearer token and served with one.

    The app's ``get_db`` and ``get_settings`` dependencies are overridden with
    an in-memory SQLite database and fixed test settings. The overrides are
    always removed afterwards, even when an assertion fails, so they cannot
    leak into other tests that share the global ``app``.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    try:
        Base.metadata.create_all(engine)
        Session = sessionmaker(bind=engine)

        def override_db():
            session = Session()
            try:
                yield session
            finally:
                session.close()

        app.dependency_overrides[get_db] = override_db
        app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
        client = TestClient(app)
        unauthorized = client.post("/mcp", json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
        authorized = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
        )
        assert unauthorized.status_code == 401
        assert authorized.status_code == 200
        assert authorized.json()["result"]["tools"][0]["name"] == "search_employees"
    finally:
        # Restore the shared app and release the database connection
        # regardless of the test outcome.
        app.dependency_overrides.clear()
        engine.dispose()
def test_mcp_search_employees_returns_matching_employee():
    """The ``search_employees`` MCP tool returns text containing a stored employee's name.

    One employee is seeded into an in-memory SQLite database, then the tool is
    called through POST /mcp with a valid bearer token. Dependency overrides
    on the global ``app`` are always cleared afterwards, even when an
    assertion fails, so they cannot leak into other tests.
    """
    engine = create_engine(
        "sqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    try:
        Base.metadata.create_all(engine)
        Session = sessionmaker(bind=engine)

        # Seed the employee the search is expected to find.
        session = Session()
        try:
            session.add(
                Employee(
                    profile_key="staff:avsergeev",
                    profile_type="staff",
                    profile_id="avsergeev",
                    canonical_url="https://www.hse.ru/staff/avsergeev",
                    full_name="Сергеев Алексей Викторович",
                    status="active",
                    first_seen_at=datetime.now(timezone.utc),
                    last_seen_at=datetime.now(timezone.utc),
                    current_data={"sections": []},
                )
            )
            session.commit()
        finally:
            # Close the seeding session even if add/commit raises.
            session.close()

        def override_db():
            db = Session()
            try:
                yield db
            finally:
                db.close()

        app.dependency_overrides[get_db] = override_db
        app.dependency_overrides[get_settings] = lambda: Settings(mcp_token="secret", session_secret="session-secret")
        client = TestClient(app)
        response = client.post(
            "/mcp",
            headers={"Authorization": "Bearer secret"},
            json={
                "jsonrpc": "2.0",
                "id": 1,
                "method": "tools/call",
                "params": {"name": "search_employees", "arguments": {"query": "Сергеев"}},
            },
        )
        assert response.status_code == 200
        assert "Сергеев Алексей Викторович" in response.json()["result"]["content"][0]["text"]
    finally:
        # Restore the shared app and release the database connection
        # regardless of the test outcome.
        app.dependency_overrides.clear()
        engine.dispose()

34
tests/test_crawler.py Normal file
View File

@@ -0,0 +1,34 @@
from datetime import datetime, timezone
from app.models import Employee
from app.services.crawler import _mark_dismissed
def test_mark_dismissed_only_marks_missing_active_employees(db_session):
    """Calling ``_mark_dismissed`` with ``{"staff:kept"}`` dismisses only the other employee.

    Two active employees are stored. After the call, ``staff:kept`` must still
    be active, while ``staff:gone`` must be dismissed with ``dismissed_at`` set.
    """

    def active_employee(profile_key, canonical_url):
        # Build an active employee stamped with the current UTC time.
        return Employee(
            profile_key=profile_key,
            canonical_url=canonical_url,
            status="active",
            first_seen_at=datetime.now(timezone.utc),
            last_seen_at=datetime.now(timezone.utc),
        )

    def fetch(profile_key):
        # Load exactly one employee by its profile key.
        return db_session.query(Employee).filter_by(profile_key=profile_key).one()

    db_session.add(active_employee("staff:kept", "https://www.hse.ru/staff/kept"))
    db_session.add(active_employee("staff:gone", "https://www.hse.ru/staff/gone"))
    db_session.commit()

    dismissed_count = _mark_dismissed(db_session, {"staff:kept"})

    assert dismissed_count == 1
    assert fetch("staff:kept").status == "active"
    missing = fetch("staff:gone")
    assert missing.status == "dismissed"
    assert missing.dismissed_at is not None

28
tests/test_parser.py Normal file
View File

@@ -0,0 +1,28 @@
from bs4 import BeautifulSoup
from app.parser.profile import extract_person_tabs
from app.parser.profile_url import normalize_profile_url, parse_profile_identity
def test_normalize_profile_url_supports_staff_and_org_persons():
    """Check URL normalization for staff and org/persons links, plus identity parsing.

    A relative staff link with a fragment and an absolute org/persons link
    with a trailing slash must both normalize to the expected absolute URL.
    A staff URL must parse into a ``(type, id)`` pair.
    """
    expected_by_input = {
        "/staff/avsergeev#sci": "https://www.hse.ru/staff/avsergeev",
        "https://www.hse.ru/org/persons/123/": "https://www.hse.ru/org/persons/123",
    }
    for raw_url, normalized in expected_by_input.items():
        assert normalize_profile_url(raw_url) == normalized

    identity = parse_profile_identity("https://www.hse.ru/staff/avsergeev")
    assert identity == ("staff", "avsergeev")
def test_extract_person_tabs_prefers_person_menu_addition():
    """Tabs come from the ``person-menu-addition`` block, not from unrelated links.

    The markup has a person menu with two anchors and a separate link to
    another person. Only the two menu entries are expected, and the fragment
    href must be resolved against the profile URL.
    """
    markup = """
<div class="person-menu is-desktop small person-menu-addition">
<a href="#main">Домашняя страница</a>
<a href="#sci" data-index="1">Публикации</a>
</div>
<a href="/org/persons/999">Other person</a>
"""
    page = BeautifulSoup(markup, "html.parser")

    tabs = extract_person_tabs(page, "https://www.hse.ru/staff/avsergeev")

    titles = [tab["title"] for tab in tabs]
    assert titles == ["Домашняя страница", "Публикации"]
    assert tabs[1]["href"] == "https://www.hse.ru/staff/avsergeev#sci"