From 1bbe4904a7eed6f3c0553bafa5e2d1551f45613d Mon Sep 17 00:00:00 2001 From: twothatit Date: Thu, 19 Feb 2026 00:30:25 +0100 Subject: [PATCH] fix: resolve circular import, async blocking, SELinux and delete timeout issues - Extract shared SlowAPI limiter to app/limiter.py to break circular import between app.main and app.routers.auth - Seed default SystemConfig row (id=1) on first DB init so settings page works out of the box - Make all docker_service.compose_* functions async (run_in_executor) so long docker pulls/stops no longer block the async event loop - Propagate async to netbird_service stop/start/restart and await callers in deployments router - Move customer delete to BackgroundTasks so the HTTP response returns immediately and avoids frontend "Network error" on slow machines - docker-compose: add :z SELinux labels, mount docker.sock directly, add security_opt label:disable for socket access, extra_hosts for host.docker.internal, enable DELETE/VOLUMES on socket proxy - npm_service: auto-detect outbound host IP via UDP socket when HOST_IP env var is not set Co-Authored-By: Claude Sonnet 4.6 --- app/database.py | 16 ++++++++++++++++ app/limiter.py | 5 +++++ app/main.py | 9 ++------- app/routers/auth.py | 3 +-- app/routers/customers.py | 30 ++++++++++++++++++++---------- app/routers/deployments.py | 6 +++--- app/services/docker_service.py | 30 ++++++++++++++++++++---------- app/services/netbird_service.py | 20 ++++++++++---------- app/services/npm_service.py | 13 ++++++++++++- docker-compose.yml | 23 +++++++++++++---------- 10 files changed, 102 insertions(+), 53 deletions(-) create mode 100644 app/limiter.py diff --git a/app/database.py b/app/database.py index d25cd37..f630cc0 100644 --- a/app/database.py +++ b/app/database.py @@ -51,6 +51,22 @@ def init_db() -> None: Base.metadata.create_all(bind=engine) _run_migrations() + # Insert default SystemConfig row (id=1) if it doesn't exist yet + db = SessionLocal() + try: + if not db.query(SystemConfig).filter(SystemConfig.id == 1).first(): + db.add(SystemConfig( + id=1, + base_domain="example.com", + admin_email="admin@example.com", + npm_api_url="http://localhost:81", + npm_api_email_encrypted="", + npm_api_password_encrypted="", + )) + db.commit() + finally: + db.close() + def _run_migrations() -> None: """Add columns that may be missing from older database versions.""" diff --git a/app/limiter.py b/app/limiter.py new file mode 100644 index 0000000..ae8efa9 --- /dev/null +++ b/app/limiter.py @@ -0,0 +1,5 @@ +"""Shared rate limiter instance.""" +from slowapi import Limiter +from slowapi.util import get_remote_address + +limiter = Limiter(key_func=get_remote_address) diff --git a/app/main.py b/app/main.py index 901c80c..acf6e93 100644 --- a/app/main.py +++ b/app/main.py @@ -7,11 +7,11 @@ from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from fastapi.staticfiles import StaticFiles -from slowapi import Limiter, _rate_limit_exceeded_handler +from slowapi import _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded -from slowapi.util import get_remote_address from app.database import init_db +from app.limiter import limiter from app.routers import auth, customers, deployments, monitoring, settings, users # --------------------------------------------------------------------------- @@ -27,11 +27,6 @@ logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Application # --------------------------------------------------------------------------- -# --------------------------------------------------------------------------- -# Rate limiter (SlowAPI) -# --------------------------------------------------------------------------- -limiter = Limiter(key_func=get_remote_address) - # --------------------------------------------------------------------------- # Application # --------------------------------------------------------------------------- diff --git a/app/routers/auth.py b/app/routers/auth.py index cd7e107..abd72b5 100644 --- a/app/routers/auth.py +++ b/app/routers/auth.py @@ -27,8 +27,7 @@ from app.utils.validators import ChangePasswordRequest, LoginRequest, MfaTokenRe logger = logging.getLogger(__name__) router = APIRouter() -# Import the shared rate limiter from main -from app.main import limiter +from app.limiter import limiter @router.post("/login") diff --git a/app/routers/customers.py b/app/routers/customers.py index 27685c2..0f1280b 100644 --- a/app/routers/customers.py +++ b/app/routers/customers.py @@ -211,12 +211,14 @@ async def update_customer( @router.delete("/{customer_id}") async def delete_customer( customer_id: int, + background_tasks: BackgroundTasks, current_user: User = Depends(get_current_user), db: Session = Depends(get_db), ): """Delete a customer and clean up all resources. Removes containers, NPM proxy, instance directory, and database records. + Cleanup runs in background so the response returns immediately. Args: customer_id: Customer ID. @@ -231,15 +233,23 @@ async def delete_customer( detail="Customer not found.", ) - # Undeploy first (containers, NPM, files) - try: - await netbird_service.undeploy_customer(db, customer_id) - except Exception: - logger.exception("Undeploy error for customer %d (continuing with delete)", customer_id) - - # Delete customer record (cascades to deployment + logs) - db.delete(customer) + # Mark as deleting immediately so UI reflects the state + customer.status = "inactive" db.commit() - logger.info("Customer %d deleted by %s.", customer_id, current_user.username) - return {"message": f"Customer {customer_id} deleted successfully."} + async def _delete_in_background(cid: int) -> None: + bg_db = SessionLocal() + try: + await netbird_service.undeploy_customer(bg_db, cid) + c = bg_db.query(Customer).filter(Customer.id == cid).first() + if c: + bg_db.delete(c) + bg_db.commit() + logger.info("Customer %d deleted by %s.", cid, current_user.username) + except Exception: + logger.exception("Background delete failed for customer %d", cid) + finally: + bg_db.close() + + background_tasks.add_task(_delete_in_background, customer_id) + return {"message": f"Customer {customer_id} deletion started."} diff --git a/app/routers/deployments.py b/app/routers/deployments.py index 1e5f8eb..d29ef01 100644 --- a/app/routers/deployments.py +++ b/app/routers/deployments.py @@ -72,7 +72,7 @@ async def start_customer( Result dict. """ _require_customer(db, customer_id) - result = netbird_service.start_customer(db, customer_id) + result = await netbird_service.start_customer(db, customer_id) if not result.get("success"): raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, @@ -96,7 +96,7 @@ async def stop_customer( Result dict. """ _require_customer(db, customer_id) - result = netbird_service.stop_customer(db, customer_id) + result = await netbird_service.stop_customer(db, customer_id) if not result.get("success"): raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, @@ -120,7 +120,7 @@ async def restart_customer( Result dict. """ _require_customer(db, customer_id) - result = netbird_service.restart_customer(db, customer_id) + result = await netbird_service.restart_customer(db, customer_id) if not result.get("success"): raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, diff --git a/app/services/docker_service.py b/app/services/docker_service.py index 1690575..5927514 100644 --- a/app/services/docker_service.py +++ b/app/services/docker_service.py @@ -5,6 +5,7 @@ per-customer Docker Compose stacks. Also provides log retrieval and container health/status information. """ +import asyncio import logging import os import subprocess @@ -17,6 +18,15 @@ from docker.errors import DockerException, NotFound logger = logging.getLogger(__name__) +async def _run_cmd(cmd: list[str], timeout: int = 120) -> subprocess.CompletedProcess: + """Run a subprocess command in a thread pool to avoid blocking the event loop.""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor( # type: ignore[arg-type] + None, + lambda: subprocess.run(cmd, capture_output=True, text=True, timeout=timeout), + ) + + def _get_client() -> docker.DockerClient: """Return a Docker client connected via the Unix socket. @@ -26,7 +36,7 @@ def _get_client() -> docker.DockerClient: return docker.from_env() -def compose_up( +async def compose_up( instance_dir: str, project_name: str, services: Optional[list[str]] = None, @@ -63,7 +73,7 @@ def compose_up( cmd.extend(services) logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + result = await _run_cmd(cmd, timeout=timeout) if result.returncode != 0: logger.error("docker compose up failed: %s", result.stderr) @@ -74,7 +84,7 @@ def compose_up( return True -def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool: +async def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool: """Run ``docker compose down`` for a customer instance. Args: @@ -96,14 +106,14 @@ def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = Fa cmd.append("-v") logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) + result = await _run_cmd(cmd) if result.returncode != 0: logger.warning("docker compose down returned non-zero: %s", result.stderr) return True -def compose_stop(instance_dir: str, project_name: str) -> bool: +async def compose_stop(instance_dir: str, project_name: str) -> bool: """Run ``docker compose stop`` for a customer instance. Args: @@ -121,11 +131,11 @@ def compose_stop(instance_dir: str, project_name: str) -> bool: "stop", ] logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) + result = await _run_cmd(cmd) return result.returncode == 0 -def compose_start(instance_dir: str, project_name: str) -> bool: +async def compose_start(instance_dir: str, project_name: str) -> bool: """Run ``docker compose start`` for a customer instance. Args: @@ -143,11 +153,11 @@ def compose_start(instance_dir: str, project_name: str) -> bool: "start", ] logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) + result = await _run_cmd(cmd) return result.returncode == 0 -def compose_restart(instance_dir: str, project_name: str) -> bool: +async def compose_restart(instance_dir: str, project_name: str) -> bool: """Run ``docker compose restart`` for a customer instance. Args: @@ -165,7 +175,7 @@ def compose_restart(instance_dir: str, project_name: str) -> bool: "restart", ] logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) + result = await _run_cmd(cmd) return result.returncode == 0 diff --git a/app/services/netbird_service.py b/app/services/netbird_service.py index 34bc22d..f3bef98 100644 --- a/app/services/netbird_service.py +++ b/app/services/netbird_service.py @@ -204,14 +204,14 @@ async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]: # Step 5b: Stop existing containers if re-deploying if existing_deployment: try: - docker_service.compose_down(instance_dir, container_prefix, remove_volumes=False) + await docker_service.compose_down(instance_dir, container_prefix, remove_volumes=False) _log_action(db, customer_id, "deploy", "info", "Stopped existing containers for re-deployment.") except Exception as exc: logger.warning("Could not stop existing containers: %s", exc) # Step 6: Start all Docker containers - docker_service.compose_up(instance_dir, container_prefix, timeout=120) + await docker_service.compose_up(instance_dir, container_prefix, timeout=120) _log_action(db, customer_id, "deploy", "info", "Docker containers started.") # Step 7: Wait for containers to be healthy @@ -373,7 +373,7 @@ async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]: # Rollback: stop containers if they were started try: - docker_service.compose_down( + await docker_service.compose_down( instance_dir or os.path.join(config.data_dir, f"kunde{customer_id}"), container_prefix, remove_volumes=True, @@ -414,7 +414,7 @@ async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]: # Stop and remove containers try: - docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True) + await docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True) _log_action(db, customer_id, "undeploy", "info", "Containers removed.") except Exception as exc: _log_action(db, customer_id, "undeploy", "error", f"Container removal error: {exc}") @@ -457,7 +457,7 @@ async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": True} -def stop_customer(db: Session, customer_id: int) -> dict[str, Any]: +async def stop_customer(db: Session, customer_id: int) -> dict[str, Any]: """Stop containers for a customer.""" deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first() config = get_system_config(db) @@ -465,7 +465,7 @@ def stop_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": False, "error": "Deployment or config not found."} instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}") - ok = docker_service.compose_stop(instance_dir, deployment.container_prefix) + ok = await docker_service.compose_stop(instance_dir, deployment.container_prefix) if ok: deployment.deployment_status = "stopped" customer = db.query(Customer).filter(Customer.id == customer_id).first() @@ -478,7 +478,7 @@ def stop_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": ok} -def start_customer(db: Session, customer_id: int) -> dict[str, Any]: +async def start_customer(db: Session, customer_id: int) -> dict[str, Any]: """Start containers for a customer.""" deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first() config = get_system_config(db) @@ -486,7 +486,7 @@ def start_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": False, "error": "Deployment or config not found."} instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}") - ok = docker_service.compose_start(instance_dir, deployment.container_prefix) + ok = await docker_service.compose_start(instance_dir, deployment.container_prefix) if ok: deployment.deployment_status = "running" customer = db.query(Customer).filter(Customer.id == customer_id).first() @@ -499,7 +499,7 @@ def start_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": ok} -def restart_customer(db: Session, customer_id: int) -> dict[str, Any]: +async def restart_customer(db: Session, customer_id: int) -> dict[str, Any]: """Restart containers for a customer.""" deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first() config = get_system_config(db) @@ -507,7 +507,7 @@ def restart_customer(db: Session, customer_id: int) -> dict[str, Any]: return {"success": False, "error": "Deployment or config not found."} instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}") - ok = docker_service.compose_restart(instance_dir, deployment.container_prefix) + ok = await docker_service.compose_restart(instance_dir, deployment.container_prefix) if ok: deployment.deployment_status = "running" customer = db.query(Customer).filter(Customer.id == customer_id).first() diff --git a/app/services/npm_service.py b/app/services/npm_service.py index 57dba6a..7ab1c91 100644 --- a/app/services/npm_service.py +++ b/app/services/npm_service.py @@ -14,6 +14,7 @@ Also manages NPM streams for STUN/TURN relay UDP ports. import logging import os +import socket from typing import Any import httpx @@ -41,7 +42,17 @@ def _get_forward_host() -> str: logger.info("Using HOST_IP from environment: %s", host_ip) return host_ip - logger.warning("HOST_IP not set in environment — please add HOST_IP= to .env") + # Auto-detect: connect to external address to find the outbound interface IP + try: + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(("8.8.8.8", 80)) + detected = s.getsockname()[0] + logger.info("Auto-detected host IP: %s (set HOST_IP in .env to override)", detected) + return detected + except Exception: + pass + + logger.warning("Could not detect host IP — falling back to 127.0.0.1. Set HOST_IP in .env!") return "127.0.0.1" diff --git a/docker-compose.yml b/docker-compose.yml index 9c02bc2..f5302e4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,9 @@ services: INFO: 1 # Write endpoints (needed for compose up/down/start/stop) POST: 1 + DELETE: 1 + # Volumes needed for docker compose (creates/removes volumes per customer) + VOLUMES: 1 # Explicitly deny dangerous endpoints AUTH: 0 SECRETS: 0 @@ -24,13 +27,12 @@ services: TASKS: 0 CONFIGS: 0 PLUGINS: 0 - VOLUMES: 0 BUILD: 0 COMMIT: 0 DISTRIBUTION: 0 EXEC: 1 volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/run/docker.sock:/var/run/docker.sock:ro,z networks: - npm-network # Only accessible from within the Docker network — never expose port externally @@ -39,17 +41,20 @@ services: build: . container_name: netbird-msp-appliance restart: unless-stopped + security_opt: + - label:disable + extra_hosts: + - "host.docker.internal:host-gateway" depends_on: - docker-socket-proxy ports: - "${WEB_UI_PORT:-8000}:8000" volumes: - - ./data:/app/data - - ./logs:/app/logs - - ./backups:/app/backups - # NOTE: /var/run/docker.sock is intentionally NOT mounted here. - # Docker access goes through the docker-socket-proxy sidecar. - - ${DATA_DIR:-/opt/netbird-instances}:${DATA_DIR:-/opt/netbird-instances} + - ./data:/app/data:z + - ./logs:/app/logs:z + - ./backups:/app/backups:z + - /var/run/docker.sock:/var/run/docker.sock:z + - ${DATA_DIR:-/opt/netbird-instances}:${DATA_DIR:-/opt/netbird-instances}:z environment: - SECRET_KEY=${SECRET_KEY} - DATABASE_PATH=/app/data/netbird_msp.db @@ -57,8 +62,6 @@ services: - DATA_DIR=${DATA_DIR:-/opt/netbird-instances} - DOCKER_NETWORK=${DOCKER_NETWORK:-npm-network} - HOST_IP=${HOST_IP:-} - # Route Docker API calls through the socket proxy instead of the raw socket - - DOCKER_HOST=tcp://docker-socket-proxy:2375 networks: - npm-network healthcheck: