feat: add dataset checkpoint sync for MCP
This commit is contained in:
41
app/mcp.py
41
app/mcp.py
@@ -7,12 +7,31 @@ from sqlalchemy.orm import Session
|
||||
from app.db import get_db
|
||||
from app.models import CrawlRun, Employee
|
||||
from app.services.admin_data import run_detail_payload
|
||||
from app.services.dataset_versions import service_info_payload, sync_employees_payload
|
||||
from app.version import BACKEND_VERSION
|
||||
|
||||
router = APIRouter(prefix="/mcp")
|
||||
PROTOCOL_VERSION = "2024-11-05"
|
||||
SERVICE_NAME = "miem-employees"
|
||||
|
||||
|
||||
TOOLS = [
|
||||
{
|
||||
"name": "get_service_info",
|
||||
"description": "Return service metadata, supported tools, and current dataset version.",
|
||||
"inputSchema": {"type": "object", "properties": {}},
|
||||
},
|
||||
{
|
||||
"name": "sync_employees",
|
||||
"description": "Synchronize employees by dataset hash. Returns a full snapshot or a delta from client_hash.",
|
||||
"inputSchema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"client_hash": {"type": "string"},
|
||||
"include_data": {"type": "boolean", "default": True},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "search_employees",
|
||||
"description": "Search MIEM employees by name or profile URL.",
|
||||
@@ -71,8 +90,8 @@ async def mcp_http(
|
||||
try:
|
||||
if method == "initialize":
|
||||
result = {
|
||||
"protocolVersion": "2024-11-05",
|
||||
"serverInfo": {"name": "miem-employees", "version": BACKEND_VERSION},
|
||||
"protocolVersion": PROTOCOL_VERSION,
|
||||
"serverInfo": {"name": SERVICE_NAME, "version": BACKEND_VERSION},
|
||||
"capabilities": {"tools": {}},
|
||||
}
|
||||
elif method == "tools/list":
|
||||
@@ -87,6 +106,24 @@ async def mcp_http(
|
||||
|
||||
|
||||
def _call_tool(db: Session, name: str, arguments: dict) -> dict:
|
||||
if name == "get_service_info":
|
||||
return _tool_response(
|
||||
service_info_payload(
|
||||
db,
|
||||
tools=TOOLS,
|
||||
service_name=SERVICE_NAME,
|
||||
backend_version=BACKEND_VERSION,
|
||||
protocol_version=PROTOCOL_VERSION,
|
||||
)
|
||||
)
|
||||
if name == "sync_employees":
|
||||
return _tool_response(
|
||||
sync_employees_payload(
|
||||
db,
|
||||
client_hash=arguments.get("client_hash"),
|
||||
include_data=bool(arguments.get("include_data", True)),
|
||||
)
|
||||
)
|
||||
if name == "search_employees":
|
||||
return _tool_response(_search_employees(db, arguments))
|
||||
if name == "get_employee":
|
||||
|
||||
@@ -76,6 +76,7 @@ class CrawlRun(Base):
|
||||
message: Mapped[str | None] = mapped_column(Text)
|
||||
|
||||
employee_changes: Mapped[list["CrawlRunEmployeeChange"]] = relationship(back_populates="crawl_run")
|
||||
dataset_versions: Mapped[list["DatasetVersion"]] = relationship(back_populates="crawl_run")
|
||||
|
||||
|
||||
class CrawlRunEmployeeChange(Base):
|
||||
@@ -134,3 +135,42 @@ class ParserSource(Base):
|
||||
source_url: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
enabled: Mapped[bool] = mapped_column(default=True, nullable=False)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, nullable=False)
|
||||
|
||||
|
||||
class DatasetVersion(Base):
    """One recorded state of the employee dataset, identified by its hash.

    Each row stores the dataset hash, the hash of the version that was the
    latest when this one was created, optional provenance (the crawl run
    that produced it), and aggregate employee counters.
    """

    __tablename__ = "dataset_versions"
    __table_args__ = (
        # A given dataset hash may be stored only once.
        UniqueConstraint("hash", name="uq_dataset_versions_hash"),
        Index("ix_dataset_versions_created_at", "created_at"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # 64-character digest of the dataset state.
    hash: Mapped[str] = mapped_column(String(64), nullable=False)
    # Hash of the preceding version; NULL for the first version.
    previous_hash: Mapped[str | None] = mapped_column(String(64))
    # Crawl run that produced this version, when one is known.
    crawl_run_id: Mapped[int | None] = mapped_column(ForeignKey("crawl_runs.id"))

    # Aggregate counters captured when the version was created.
    employee_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    active_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    dismissed_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        nullable=False,
    )

    crawl_run: Mapped[CrawlRun | None] = relationship(back_populates="dataset_versions")
    # Items are owned by the version: they are deleted together with it.
    items: Mapped[list["DatasetVersionItem"]] = relationship(
        back_populates="dataset_version",
        cascade="all, delete-orphan",
    )
|
||||
|
||||
|
||||
class DatasetVersionItem(Base):
    """Per-employee marker (status and checksum) stored for one dataset version."""

    __tablename__ = "dataset_version_items"
    __table_args__ = (
        # A profile may appear at most once within a single version.
        UniqueConstraint(
            "dataset_version_id",
            "profile_key",
            name="uq_dataset_version_items_version_profile",
        ),
        # NOTE(review): despite the "_hash" suffix in its name, this index
        # covers ``dataset_version_id`` -- confirm the name is intentional.
        Index("ix_dataset_version_items_hash", "dataset_version_id"),
        Index("ix_dataset_version_items_profile_key", "profile_key"),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    dataset_version_id: Mapped[int] = mapped_column(
        ForeignKey("dataset_versions.id"),
        nullable=False,
    )
    profile_key: Mapped[str] = mapped_column(String(255), nullable=False)
    # Nullable link to the employee row this marker was taken from.
    employee_id: Mapped[int | None] = mapped_column(ForeignKey("employees.id"))
    status: Mapped[str] = mapped_column(String(32), nullable=False)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)

    dataset_version: Mapped[DatasetVersion] = relationship(back_populates="items")
    employee: Mapped[Employee | None] = relationship()
|
||||
|
||||
@@ -13,6 +13,7 @@ from app.models import CrawlError, CrawlRun, CrawlRunEmployeeChange, Employee, E
|
||||
from app.parser.collector import collect_profile_links
|
||||
from app.parser.profile import parse_person_profile
|
||||
from app.parser.profile_url import profile_key
|
||||
from app.services.dataset_versions import get_or_create_current_version
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; MIEMEmployeesBot/0.1.0; +https://miem.hse.ru/)"
|
||||
@@ -70,6 +71,7 @@ def run_crawl(db: Session, settings: Settings) -> CrawlRun:
|
||||
|
||||
run.dismissed_count = _mark_dismissed(db, run, found_keys, session, settings.request_timeout)
|
||||
run.status = "completed"
|
||||
get_or_create_current_version(db, crawl_run_id=run.id)
|
||||
except Exception as exc:
|
||||
run.status = "failed"
|
||||
run.message = str(exc)
|
||||
@@ -103,6 +105,7 @@ def refresh_employee(db: Session, employee: Employee, settings: Settings) -> Cra
|
||||
_upsert_employee(db, run, parsed)
|
||||
run.parsed_count = 1
|
||||
run.status = "completed"
|
||||
get_or_create_current_version(db, crawl_run_id=run.id)
|
||||
except Exception as exc:
|
||||
run.status = "failed"
|
||||
run.error_count = 1
|
||||
|
||||
227
app/services/dataset_versions.py
Normal file
227
app/services/dataset_versions.py
Normal file
@@ -0,0 +1,227 @@
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy import desc, select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import DatasetVersion, DatasetVersionItem, Employee
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class EmployeeMarker:
|
||||
profile_key: str
|
||||
employee_id: int | None
|
||||
status: str
|
||||
checksum: str
|
||||
|
||||
|
||||
def get_or_create_current_version(db: Session, *, crawl_run_id: int | None = None) -> DatasetVersion:
    """Return the dataset version matching the current employees, creating it if needed.

    The hash of the current employee markers is computed first. If a version
    with that hash is already stored, it is returned unchanged. Otherwise a
    new version and one item per employee are added to the session and
    flushed (not committed -- committing is left to the caller).

    Args:
        db: Active SQLAlchemy session.
        crawl_run_id: Optional id of the crawl run recorded on a newly
            created version. Ignored when an existing version is reused.

    Returns:
        The existing or newly created ``DatasetVersion``.
    """
    employees = db.scalars(select(Employee).order_by(Employee.profile_key)).all()
    markers = [_employee_marker(employee) for employee in employees]
    dataset_hash = _dataset_hash(markers)

    # ``hash`` is unique across versions, so ANY stored version with this hash
    # must be reused, not only the latest one. Comparing against the latest
    # version alone would attempt a duplicate insert (and fail the unique
    # constraint) whenever the dataset returns to an earlier state.
    existing = get_version_by_hash(db, dataset_hash)
    if existing is not None:
        return existing

    latest = get_latest_version(db)
    version = DatasetVersion(
        hash=dataset_hash,
        previous_hash=latest.hash if latest is not None else None,
        crawl_run_id=crawl_run_id,
        employee_count=len(markers),
        active_count=sum(1 for marker in markers if marker.status == "active"),
        dismissed_count=sum(1 for marker in markers if marker.status == "dismissed"),
    )
    db.add(version)
    # Flush so ``version.id`` is assigned before the items reference it.
    db.flush()

    db.add_all(
        DatasetVersionItem(
            dataset_version_id=version.id,
            profile_key=marker.profile_key,
            employee_id=marker.employee_id,
            status=marker.status,
            checksum=marker.checksum,
        )
        for marker in markers
    )
    db.flush()
    return version
|
||||
|
||||
|
||||
def get_latest_version(db: Session) -> DatasetVersion | None:
    """Return the newest dataset version, or ``None`` when none is stored.

    Versions are ordered by ``created_at`` descending, with ``id`` descending
    as the tie-breaker.
    """
    newest_first = select(DatasetVersion).order_by(
        desc(DatasetVersion.created_at),
        desc(DatasetVersion.id),
    )
    return db.scalar(newest_first.limit(1))
|
||||
|
||||
|
||||
def get_version_by_hash(db: Session, dataset_hash: str | None) -> DatasetVersion | None:
    """Look up a dataset version by its hash.

    Returns ``None`` without querying when ``dataset_hash`` is empty or
    ``None``, and also when no stored version has that hash.
    """
    if dataset_hash:
        lookup = select(DatasetVersion).where(DatasetVersion.hash == dataset_hash)
        return db.scalar(lookup.limit(1))
    return None
|
||||
|
||||
|
||||
def service_info_payload(db: Session, *, tools: list[dict], service_name: str, backend_version: str, protocol_version: str) -> dict:
    """Build the service-info response: service metadata, tools, and dataset version.

    Resolves (and, if necessary, creates) the current dataset version and
    commits the session before assembling the response.
    """
    current_version = get_or_create_current_version(db)
    # Persist a version that may have just been created.
    db.commit()

    info = {
        "service_name": service_name,
        "backend_version": backend_version,
        "protocolVersion": protocol_version,
        "tools": tools,
    }
    info["dataset"] = _version_payload(current_version)
    return info
|
||||
|
||||
|
||||
def sync_employees_payload(db: Session, *, client_hash: str | None = None, include_data: bool = True) -> dict:
    """Build a sync response relative to the dataset hash the client holds.

    Outcomes:
      * no ``client_hash``: full snapshot;
      * ``client_hash`` equals the current hash: delta with empty change lists;
      * ``client_hash`` matches a stored version: delta from that version;
      * ``client_hash`` is unknown: full snapshot with reason
        ``"unknown_client_hash"``.

    The current version is resolved (created if needed) and committed first.
    """
    current = get_or_create_current_version(db)
    db.commit()

    if not client_hash:
        return _full_sync_payload(db, current, include_data=include_data, reason=None)

    if client_hash != current.hash:
        known_version = get_version_by_hash(db, client_hash)
        if known_version:
            return _delta_sync_payload(db, known_version, current, include_data=include_data)
        return _full_sync_payload(
            db,
            current,
            include_data=include_data,
            reason="unknown_client_hash",
            from_hash=client_hash,
        )

    # Client is already up to date: report a delta with nothing in it.
    no_changes = {bucket: [] for bucket in ("added", "updated", "dismissed", "removed")}
    return {
        "mode": "delta",
        "from_hash": client_hash,
        "to_hash": current.hash,
        "dataset": _version_payload(current),
        "changes": no_changes,
    }
|
||||
|
||||
|
||||
def _full_sync_payload(
    db: Session,
    current: DatasetVersion,
    *,
    include_data: bool,
    reason: str | None,
    from_hash: str | None = None,
) -> dict:
    """Build a full-snapshot response listing every employee.

    Employees are read ordered by ``profile_key``. The ``"reason"`` key is
    added only when ``reason`` is non-empty.
    """
    ordered_employees = db.scalars(select(Employee).order_by(Employee.profile_key)).all()
    items = []
    for employee in ordered_employees:
        items.append(_employee_payload(employee, include_data=include_data))

    response = {
        "mode": "full",
        "from_hash": from_hash,
        "to_hash": current.hash,
        "dataset": _version_payload(current),
        "items": items,
    }
    if not reason:
        return response
    response["reason"] = reason
    return response
|
||||
|
||||
|
||||
def _delta_sync_payload(db: Session, previous: DatasetVersion, current: DatasetVersion, *, include_data: bool) -> dict:
    """Build a delta response describing the changes from ``previous`` to ``current``.

    Items of both versions are compared by ``profile_key``:
      * present only in ``current``: reported under ``added``;
      * status or checksum differs and the current status is ``"dismissed"``:
        reported under ``dismissed`` as a tombstone;
      * status or checksum differs otherwise: reported under ``updated``;
      * present only in ``previous``: reported under ``removed`` as a tombstone.

    ``added`` and ``updated`` entries are built from the live employee rows
    and are skipped when no employee with that key exists any more.
    """
    previous_items = _items_by_profile_key(previous)
    current_items = _items_by_profile_key(current)
    employees = {employee.profile_key: employee for employee in db.scalars(select(Employee)).all()}

    added = []
    updated = []
    dismissed = []

    # Profile keys are unique per version, so sorting the (key, item) pairs
    # orders by key alone and never compares the item objects.
    for profile_key, current_item in sorted(current_items.items()):
        previous_item = previous_items.get(profile_key)
        employee = employees.get(profile_key)

        if previous_item is None:
            if employee is not None:
                added.append(_employee_payload(employee, include_data=include_data))
            continue

        unchanged = (
            previous_item.checksum == current_item.checksum
            and previous_item.status == current_item.status
        )
        if unchanged:
            continue

        if current_item.status == "dismissed":
            # Pass the version item's checksum explicitly, as the "removed"
            # tombstones do, so the tombstone always carries the checksum
            # recorded for this version instead of depending on the employee
            # row having ``current_checksum`` populated.
            dismissed.append(
                _tombstone(profile_key, current_item.status, employee, checksum=current_item.checksum)
            )
        elif employee is not None:
            updated.append(_employee_payload(employee, include_data=include_data))

    removed = [
        _tombstone(profile_key, "removed", employees.get(profile_key), checksum=previous_item.checksum)
        for profile_key, previous_item in sorted(previous_items.items())
        if profile_key not in current_items
    ]

    return {
        "mode": "delta",
        "from_hash": previous.hash,
        "to_hash": current.hash,
        "dataset": _version_payload(current),
        "changes": {
            "added": added,
            "updated": updated,
            "dismissed": dismissed,
            "removed": removed,
        },
    }
|
||||
|
||||
|
||||
def _items_by_profile_key(version: DatasetVersion) -> dict[str, DatasetVersionItem]:
    """Index a version's items by their ``profile_key``."""
    indexed = {}
    for entry in version.items:
        indexed[entry.profile_key] = entry
    return indexed
|
||||
|
||||
|
||||
def _version_payload(version: DatasetVersion) -> dict:
    """Serialize a dataset version's metadata into a plain dict.

    ``created_at`` is rendered in ISO format, or ``None`` when unset.
    """
    created = version.created_at
    summary = {
        "hash": version.hash,
        "previous_hash": version.previous_hash,
        "created_at": created.isoformat() if created else None,
    }
    # The remaining fields are copied verbatim, in this order.
    for field_name in ("crawl_run_id", "employee_count", "active_count", "dismissed_count"):
        summary[field_name] = getattr(version, field_name)
    return summary
|
||||
|
||||
|
||||
def _employee_marker(employee: Employee) -> EmployeeMarker:
    """Summarize an employee as an ``EmployeeMarker``.

    Uses the employee's stored checksum when present; otherwise hashes its
    current data (an empty dict when there is none).
    """
    checksum = employee.current_checksum
    if not checksum:
        checksum = _payload_hash(employee.current_data or {})
    return EmployeeMarker(
        profile_key=employee.profile_key,
        employee_id=employee.id,
        status=employee.status,
        checksum=checksum,
    )
|
||||
|
||||
|
||||
def _dataset_hash(markers: list[EmployeeMarker]) -> str:
    """Hash the whole dataset from its employee markers.

    Markers are ordered by ``profile_key`` so the result does not depend on
    input order. ``employee_id`` is not part of the hashed content.
    """
    ordered = sorted(markers, key=lambda marker: marker.profile_key)
    rows = []
    for marker in ordered:
        rows.append(
            {
                "profile_key": marker.profile_key,
                "status": marker.status,
                "checksum": marker.checksum,
            }
        )
    return _payload_hash(rows)
|
||||
|
||||
|
||||
def _payload_hash(value: object) -> str:
    """Return the SHA-256 hex digest of ``value`` serialized as compact JSON.

    Keys are sorted and separators are compact, so equal content always gives
    the same digest. Non-ASCII text is kept as-is and encoded as UTF-8.
    Values JSON cannot serialize natively are converted with ``str``.
    """
    canonical = json.dumps(
        value,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def _employee_payload(employee: Employee, *, include_data: bool) -> dict:
    """Serialize an employee for a sync response.

    Timestamps are rendered in ISO format (``None`` when unset). The checksum
    is the stored one when present, otherwise a hash of the current data.
    The ``"data"`` key is included only when ``include_data`` is true.
    """
    last_seen = employee.last_seen_at
    dismissed = employee.dismissed_at
    checksum = employee.current_checksum
    if not checksum:
        checksum = _payload_hash(employee.current_data or {})

    result = {
        "profile_key": employee.profile_key,
        "profile_id": employee.profile_id,
        "full_name": employee.full_name,
        "status": employee.status,
        "canonical_url": employee.canonical_url,
        "last_seen_at": last_seen.isoformat() if last_seen else None,
        "dismissed_at": dismissed.isoformat() if dismissed else None,
        "checksum": checksum,
    }
    if include_data:
        result["data"] = employee.current_data
    return result
|
||||
|
||||
|
||||
def _tombstone(profile_key: str, status: str, employee: Employee | None, *, checksum: str | None = None) -> dict:
    """Build a minimal record for a profile that was dismissed or removed.

    The checksum is the explicit ``checksum`` when given, else the employee's
    stored checksum, else ``None``. Identity fields are added only when an
    employee row is available.
    """
    if checksum:
        resolved_checksum = checksum
    elif employee:
        resolved_checksum = employee.current_checksum
    else:
        resolved_checksum = None

    record = {
        "profile_key": profile_key,
        "status": status,
        "checksum": resolved_checksum,
    }
    if not employee:
        return record

    dismissed = employee.dismissed_at
    record["profile_id"] = employee.profile_id
    record["full_name"] = employee.full_name
    record["canonical_url"] = employee.canonical_url
    record["dismissed_at"] = dismissed.isoformat() if dismissed else None
    return record
|
||||
@@ -1,3 +1,3 @@
|
||||
APP_VERSION = "0.4.7"
|
||||
FRONTEND_VERSION = "0.4.7"
|
||||
BACKEND_VERSION = "0.4.7"
|
||||
APP_VERSION = "0.5.0"
|
||||
FRONTEND_VERSION = "0.5.0"
|
||||
BACKEND_VERSION = "0.5.0"
|
||||
|
||||
Reference in New Issue
Block a user