fix: resolve circular import, async blocking, SELinux and delete timeout issues

- Extract shared SlowAPI limiter to app/limiter.py to break circular
  import between app.main and app.routers.auth
- Seed default SystemConfig row (id=1) on first DB init so settings
  page works out of the box
- Make all docker_service.compose_* functions async (run_in_executor)
  so long docker pulls/stops no longer block the async event loop
- Propagate async to netbird_service stop/start/restart and await
  callers in deployments router
- Move customer delete to BackgroundTasks so the HTTP response returns
  immediately and avoids frontend "Network error" on slow machines
- docker-compose: add :z SELinux labels, mount docker.sock directly,
  add security_opt label:disable for socket access, extra_hosts for
  host.docker.internal, enable DELETE/VOLUMES on socket proxy
- npm_service: auto-detect outbound host IP via UDP socket when
  HOST_IP env var is not set

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
commit 1bbe4904a7
parent 0ac15e4db9
Date: 2026-02-19 00:30:25 +01:00
10 changed files with 102 additions and 53 deletions

app/database.py

@@ -51,6 +51,22 @@ def init_db() -> None:
     Base.metadata.create_all(bind=engine)
     _run_migrations()
 
+    # Insert default SystemConfig row (id=1) if it doesn't exist yet
+    db = SessionLocal()
+    try:
+        if not db.query(SystemConfig).filter(SystemConfig.id == 1).first():
+            db.add(SystemConfig(
+                id=1,
+                base_domain="example.com",
+                admin_email="admin@example.com",
+                npm_api_url="http://localhost:81",
+                npm_api_email_encrypted="",
+                npm_api_password_encrypted="",
+            ))
+            db.commit()
+    finally:
+        db.close()
+
 
 def _run_migrations() -> None:
     """Add columns that may be missing from older database versions."""

app/limiter.py (new file)

@@ -0,0 +1,5 @@
"""Shared rate limiter instance."""
from slowapi import Limiter
from slowapi.util import get_remote_address
limiter = Limiter(key_func=get_remote_address)
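
Both app.main and app.routers.auth now import from this leaf module, which is what breaks the cycle. For context, the wiring on the app side presumably follows the stock SlowAPI pattern (a sketch based on the handler imports kept in main.py below, not a verbatim excerpt of the file):

```python
from fastapi import FastAPI
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded

from app.limiter import limiter

app = FastAPI()
app.state.limiter = limiter  # SlowAPI expects the limiter on app.state
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)  # -> 429 responses
```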

app/main.py

@@ -7,11 +7,11 @@ from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
-from slowapi import Limiter, _rate_limit_exceeded_handler
+from slowapi import _rate_limit_exceeded_handler
 from slowapi.errors import RateLimitExceeded
-from slowapi.util import get_remote_address
 
 from app.database import init_db
+from app.limiter import limiter
 from app.routers import auth, customers, deployments, monitoring, settings, users
 
 # ---------------------------------------------------------------------------
@@ -27,11 +27,6 @@ logger = logging.getLogger(__name__)
 
-# ---------------------------------------------------------------------------
-# Rate limiter (SlowAPI)
-# ---------------------------------------------------------------------------
-limiter = Limiter(key_func=get_remote_address)
-
 # ---------------------------------------------------------------------------
 # Application
 # ---------------------------------------------------------------------------

app/routers/auth.py

@@ -27,8 +27,7 @@ from app.utils.validators import ChangePasswordRequest, LoginRequest, MfaTokenRe
 logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
-# Import the shared rate limiter from main
-from app.main import limiter
+from app.limiter import limiter
 
 @router.post("/login")
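
Routes consume the shared instance exactly as before; only the import path changed. A hypothetical usage sketch (the decorator and rate string are not part of this hunk; SlowAPI requires the endpoint to accept the Request so the key_func can read the client address):

```python
from fastapi import Request

@router.post("/login")
@limiter.limit("5/minute")  # illustrative rate, not taken from the diff
async def login(request: Request, payload: LoginRequest):
    ...
```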

app/routers/customers.py

@@ -211,12 +211,14 @@ async def update_customer(
 @router.delete("/{customer_id}")
 async def delete_customer(
     customer_id: int,
+    background_tasks: BackgroundTasks,
     current_user: User = Depends(get_current_user),
     db: Session = Depends(get_db),
 ):
     """Delete a customer and clean up all resources.
 
     Removes containers, NPM proxy, instance directory, and database records.
+    Cleanup runs in the background so the response returns immediately.
 
     Args:
         customer_id: Customer ID.
@@ -231,15 +233,23 @@ async def delete_customer(
             detail="Customer not found.",
         )
 
-    # Undeploy first (containers, NPM, files)
-    try:
-        await netbird_service.undeploy_customer(db, customer_id)
-    except Exception:
-        logger.exception("Undeploy error for customer %d (continuing with delete)", customer_id)
-
-    # Delete customer record (cascades to deployment + logs)
-    db.delete(customer)
+    # Mark as deleting immediately so the UI reflects the state
+    customer.status = "inactive"
     db.commit()
 
-    logger.info("Customer %d deleted by %s.", customer_id, current_user.username)
-    return {"message": f"Customer {customer_id} deleted successfully."}
+    # Capture the username now: the request-scoped session (and the ORM
+    # instance behind current_user) is closed by the time the task runs.
+    username = current_user.username
+
+    async def _delete_in_background(cid: int) -> None:
+        bg_db = SessionLocal()
+        try:
+            await netbird_service.undeploy_customer(bg_db, cid)
+            c = bg_db.query(Customer).filter(Customer.id == cid).first()
+            if c:
+                bg_db.delete(c)
+                bg_db.commit()
+            logger.info("Customer %d deleted by %s.", cid, username)
+        except Exception:
+            logger.exception("Background delete failed for customer %d", cid)
+        finally:
+            bg_db.close()
+
+    background_tasks.add_task(_delete_in_background, customer_id)
+    return {"message": f"Customer {customer_id} deletion started."}

app/routers/deployments.py

@@ -72,7 +72,7 @@ async def start_customer(
         Result dict.
     """
     _require_customer(db, customer_id)
-    result = netbird_service.start_customer(db, customer_id)
+    result = await netbird_service.start_customer(db, customer_id)
 
     if not result.get("success"):
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -96,7 +96,7 @@ async def stop_customer(
         Result dict.
     """
     _require_customer(db, customer_id)
-    result = netbird_service.stop_customer(db, customer_id)
+    result = await netbird_service.stop_customer(db, customer_id)
 
     if not result.get("success"):
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -120,7 +120,7 @@ async def restart_customer(
         Result dict.
     """
     _require_customer(db, customer_id)
-    result = netbird_service.restart_customer(db, customer_id)
+    result = await netbird_service.restart_customer(db, customer_id)
 
     if not result.get("success"):
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
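
The added awaits are not cosmetic. Once the service functions became coroutine functions, calling them without await returns a coroutine object rather than a dict, so result.get("success") would raise AttributeError on every request. A tiny demonstration of the failure mode:

```python
import asyncio

async def stop_customer() -> dict:
    return {"success": True}

async def main() -> None:
    result = stop_customer()        # missing await: a coroutine, not a dict
    print(hasattr(result, "get"))   # False, so result.get("success") would crash
    result.close()                  # silence the "never awaited" warning
    result = await stop_customer()  # correct
    print(result.get("success"))    # True

asyncio.run(main())
```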

docker_service.py

@@ -5,6 +5,7 @@ per-customer Docker Compose stacks. Also provides log retrieval and
 container health/status information.
 """
 
+import asyncio
 import logging
 import os
 import subprocess
@@ -17,6 +18,15 @@ from docker.errors import DockerException, NotFound
 logger = logging.getLogger(__name__)
 
 
+async def _run_cmd(cmd: list[str], timeout: int = 120) -> subprocess.CompletedProcess:
+    """Run a subprocess command in a thread pool to avoid blocking the event loop."""
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(
+        None,
+        lambda: subprocess.run(cmd, capture_output=True, text=True, timeout=timeout),
+    )
+
 
 def _get_client() -> docker.DockerClient:
     """Return a Docker client connected via the Unix socket.
@@ -26,7 +36,7 @@ def _get_client() -> docker.DockerClient:
     return docker.from_env()
 
 
-def compose_up(
+async def compose_up(
     instance_dir: str,
     project_name: str,
     services: Optional[list[str]] = None,
@@ -63,7 +73,7 @@ def compose_up(
         cmd.extend(services)
 
     logger.info("Running: %s", " ".join(cmd))
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+    result = await _run_cmd(cmd, timeout=timeout)
 
     if result.returncode != 0:
         logger.error("docker compose up failed: %s", result.stderr)
@@ -74,7 +84,7 @@ def compose_up(
     return True
 
 
-def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
+async def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
     """Run ``docker compose down`` for a customer instance.
 
     Args:
@@ -96,14 +106,14 @@ def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = Fa
         cmd.append("-v")
 
     logger.info("Running: %s", " ".join(cmd))
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    result = await _run_cmd(cmd)
 
     if result.returncode != 0:
         logger.warning("docker compose down returned non-zero: %s", result.stderr)
 
     return True
 
 
-def compose_stop(instance_dir: str, project_name: str) -> bool:
+async def compose_stop(instance_dir: str, project_name: str) -> bool:
     """Run ``docker compose stop`` for a customer instance.
 
     Args:
@@ -121,11 +131,11 @@ def compose_stop(instance_dir: str, project_name: str) -> bool:
         "stop",
     ]
 
     logger.info("Running: %s", " ".join(cmd))
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    result = await _run_cmd(cmd)
 
     return result.returncode == 0
 
 
-def compose_start(instance_dir: str, project_name: str) -> bool:
+async def compose_start(instance_dir: str, project_name: str) -> bool:
     """Run ``docker compose start`` for a customer instance.
 
     Args:
@@ -143,11 +153,11 @@ def compose_start(instance_dir: str, project_name: str) -> bool:
         "start",
     ]
 
     logger.info("Running: %s", " ".join(cmd))
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    result = await _run_cmd(cmd)
 
     return result.returncode == 0
 
 
-def compose_restart(instance_dir: str, project_name: str) -> bool:
+async def compose_restart(instance_dir: str, project_name: str) -> bool:
     """Run ``docker compose restart`` for a customer instance.
 
     Args:
@@ -165,7 +175,7 @@ def compose_restart(instance_dir: str, project_name: str) -> bool:
         "restart",
     ]
 
     logger.info("Running: %s", " ".join(cmd))
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    result = await _run_cmd(cmd)
 
     return result.returncode == 0
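
As an aside, on Python 3.9+ the same hand-off can be written with asyncio.to_thread, which hides the explicit loop and lambda (an equivalent sketch, not part of the commit):

```python
import asyncio
import subprocess

async def _run_cmd(cmd: list[str], timeout: int = 120) -> subprocess.CompletedProcess:
    # Runs the blocking subprocess.run in the default thread pool, just like
    # loop.run_in_executor(None, ...), keeping the event loop responsive.
    return await asyncio.to_thread(
        subprocess.run, cmd, capture_output=True, text=True, timeout=timeout
    )
```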

netbird_service.py

@@ -204,14 +204,14 @@ async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
     # Step 5b: Stop existing containers if re-deploying
     if existing_deployment:
         try:
-            docker_service.compose_down(instance_dir, container_prefix, remove_volumes=False)
+            await docker_service.compose_down(instance_dir, container_prefix, remove_volumes=False)
             _log_action(db, customer_id, "deploy", "info",
                         "Stopped existing containers for re-deployment.")
         except Exception as exc:
             logger.warning("Could not stop existing containers: %s", exc)
 
     # Step 6: Start all Docker containers
-    docker_service.compose_up(instance_dir, container_prefix, timeout=120)
+    await docker_service.compose_up(instance_dir, container_prefix, timeout=120)
     _log_action(db, customer_id, "deploy", "info", "Docker containers started.")
 
     # Step 7: Wait for containers to be healthy
@@ -373,7 +373,7 @@ async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
         # Rollback: stop containers if they were started
         try:
-            docker_service.compose_down(
+            await docker_service.compose_down(
                 instance_dir or os.path.join(config.data_dir, f"kunde{customer_id}"),
                 container_prefix,
                 remove_volumes=True,
@@ -414,7 +414,7 @@ async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
     # Stop and remove containers
     try:
-        docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True)
+        await docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True)
         _log_action(db, customer_id, "undeploy", "info", "Containers removed.")
     except Exception as exc:
         _log_action(db, customer_id, "undeploy", "error", f"Container removal error: {exc}")
@@ -457,7 +457,7 @@ async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
     return {"success": True}
 
 
-def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
+async def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
     """Stop containers for a customer."""
     deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
     config = get_system_config(db)
@@ -465,7 +465,7 @@ def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
         return {"success": False, "error": "Deployment or config not found."}
 
     instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
-    ok = docker_service.compose_stop(instance_dir, deployment.container_prefix)
+    ok = await docker_service.compose_stop(instance_dir, deployment.container_prefix)
 
     if ok:
         deployment.deployment_status = "stopped"
         customer = db.query(Customer).filter(Customer.id == customer_id).first()
@@ -478,7 +478,7 @@ def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
 
     return {"success": ok}
 
 
-def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
+async def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
     """Start containers for a customer."""
     deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
     config = get_system_config(db)
@@ -486,7 +486,7 @@ def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
         return {"success": False, "error": "Deployment or config not found."}
 
     instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
-    ok = docker_service.compose_start(instance_dir, deployment.container_prefix)
+    ok = await docker_service.compose_start(instance_dir, deployment.container_prefix)
 
     if ok:
         deployment.deployment_status = "running"
         customer = db.query(Customer).filter(Customer.id == customer_id).first()
@@ -499,7 +499,7 @@ def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
 
     return {"success": ok}
 
 
-def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
+async def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
     """Restart containers for a customer."""
     deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
     config = get_system_config(db)
@@ -507,7 +507,7 @@ def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
         return {"success": False, "error": "Deployment or config not found."}
 
     instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
-    ok = docker_service.compose_restart(instance_dir, deployment.container_prefix)
+    ok = await docker_service.compose_restart(instance_dir, deployment.container_prefix)
 
     if ok:
         deployment.deployment_status = "running"
         customer = db.query(Customer).filter(Customer.id == customer_id).first()
View File

@@ -14,6 +14,7 @@ Also manages NPM streams for STUN/TURN relay UDP ports.
 import logging
 import os
+import socket
 from typing import Any
 
 import httpx
@@ -41,7 +42,17 @@ def _get_forward_host() -> str:
         logger.info("Using HOST_IP from environment: %s", host_ip)
         return host_ip
 
-    logger.warning("HOST_IP not set in environment — please add HOST_IP=<your-server-ip> to .env")
+    # Auto-detect: connect to external address to find the outbound interface IP
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))
+            detected = s.getsockname()[0]
+            logger.info("Auto-detected host IP: %s (set HOST_IP in .env to override)", detected)
+            return detected
+    except Exception:
+        pass
+
+    logger.warning("Could not detect host IP — falling back to 127.0.0.1. Set HOST_IP in .env!")
     return "127.0.0.1"

docker-compose.yml

@@ -15,6 +15,9 @@ services:
       INFO: 1
       # Write endpoints (needed for compose up/down/start/stop)
       POST: 1
+      DELETE: 1
+      # Volumes needed for docker compose (creates/removes volumes per customer)
+      VOLUMES: 1
       # Explicitly deny dangerous endpoints
       AUTH: 0
       SECRETS: 0
@@ -24,13 +27,12 @@ services:
       TASKS: 0
       CONFIGS: 0
       PLUGINS: 0
-      VOLUMES: 0
       BUILD: 0
       COMMIT: 0
       DISTRIBUTION: 0
       EXEC: 1
     volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro,z
     networks:
       - npm-network
     # Only accessible from within the Docker network — never expose port externally
@@ -39,17 +41,20 @@ services:
     build: .
     container_name: netbird-msp-appliance
     restart: unless-stopped
+    security_opt:
+      - label:disable
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     depends_on:
       - docker-socket-proxy
     ports:
       - "${WEB_UI_PORT:-8000}:8000"
     volumes:
-      - ./data:/app/data
-      - ./logs:/app/logs
-      - ./backups:/app/backups
-      # NOTE: /var/run/docker.sock is intentionally NOT mounted here.
-      # Docker access goes through the docker-socket-proxy sidecar.
-      - ${DATA_DIR:-/opt/netbird-instances}:${DATA_DIR:-/opt/netbird-instances}
+      - ./data:/app/data:z
+      - ./logs:/app/logs:z
+      - ./backups:/app/backups:z
+      - /var/run/docker.sock:/var/run/docker.sock:z
+      - ${DATA_DIR:-/opt/netbird-instances}:${DATA_DIR:-/opt/netbird-instances}:z
     environment:
       - SECRET_KEY=${SECRET_KEY}
       - DATABASE_PATH=/app/data/netbird_msp.db
@@ -57,8 +62,6 @@ services:
       - DATA_DIR=${DATA_DIR:-/opt/netbird-instances}
       - DOCKER_NETWORK=${DOCKER_NETWORK:-npm-network}
      - HOST_IP=${HOST_IP:-}
-      # Route Docker API calls through the socket proxy instead of the raw socket
-      - DOCKER_HOST=tcp://docker-socket-proxy:2375
     networks:
       - npm-network
     healthcheck:
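
With DOCKER_HOST removed and the socket mounted directly, the Docker SDK falls back to the local Unix socket (the :z suffix relabels mounts for SELinux, and label:disable turns off SELinux separation for this container). A quick in-container connectivity check, assuming the docker SDK already used by docker_service:

```python
import docker

# from_env() honors DOCKER_HOST when set; with the variable gone it uses
# unix:///var/run/docker.sock, which this compose file now mounts directly.
client = docker.from_env()
print(client.ping())  # True when the daemon is reachable over the socket
```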