First Build alpha 0.1

This commit is contained in:
2026-02-07 12:18:20 +01:00
parent 29e83436b2
commit 42a3cc9d9f
36 changed files with 4982 additions and 51 deletions

0
app/services/__init__.py Normal file
View File

View File

@@ -0,0 +1,334 @@
"""Docker container management via the Python Docker SDK.
Responsible for creating, starting, stopping, restarting, and removing
per-customer Docker Compose stacks. Also provides log retrieval and
container health/status information.
"""
import logging
import os
import subprocess
import time
from typing import Any, Optional
import docker
from docker.errors import DockerException, NotFound
logger = logging.getLogger(__name__)
def _get_client() -> docker.DockerClient:
    """Build a Docker client from environment settings (Unix socket by default).

    Returns:
        A connected docker.DockerClient instance.
    """
    return docker.from_env()
def compose_up(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose up -d`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name (e.g. ``netbird-kunde5``).

    Returns:
        True on success.

    Raises:
        FileNotFoundError: If ``docker-compose.yml`` is missing from instance_dir.
        RuntimeError: If ``docker compose up`` exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds 120 seconds.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    if not os.path.isfile(compose_file):
        raise FileNotFoundError(f"docker-compose.yml not found at {compose_file}")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "up", "-d", "--remove-orphans",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        logger.error("docker compose up failed: %s", result.stderr)
        raise RuntimeError(f"docker compose up failed: {result.stderr}")
    logger.info("docker compose up succeeded for %s", project_name)
    return True
def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
    """Tear down a customer's stack via ``docker compose down``.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.
        remove_volumes: Whether to also remove volumes.

    Returns:
        True on success (non-zero exit is logged, not raised).
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = ["docker", "compose", "-f", compose_file, "-p", project_name, "down"]
    if remove_volumes:
        cmd += ["-v"]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        logger.warning("docker compose down returned non-zero: %s", completed.stderr)
    return True
def compose_stop(instance_dir: str, project_name: str) -> bool:
    """Stop a customer's compose stack without removing containers.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "stop",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def compose_start(instance_dir: str, project_name: str) -> bool:
    """Start a previously stopped customer compose stack.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "start",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def compose_restart(instance_dir: str, project_name: str) -> bool:
    """Restart a customer's compose stack in place.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "restart",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def get_container_status(container_prefix: str) -> list[dict[str, Any]]:
    """Get the status of all containers matching a name prefix.

    Args:
        container_prefix: Container name prefix (e.g. ``netbird-kunde5``).

    Returns:
        List of dicts with container name, status, health, image, and
        creation timestamp. Empty list on Docker API errors.
    """
    results: list[dict[str, Any]] = []
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            state = container.attrs.get("State", {})
            health = state["Health"].get("Status", "N/A") if state.get("Health") else "N/A"
            tags = container.image.tags
            results.append({
                "name": container.name,
                "status": container.status,
                "health": health,
                "image": str(tags[0]) if tags else str(container.image.id[:12]),
                "created": container.attrs.get("Created", ""),
            })
    except DockerException as exc:
        logger.error("Failed to get container status: %s", exc)
    return results
def get_container_logs(container_name: str, tail: int = 200) -> str:
    """Retrieve recent timestamped logs from one container.

    Args:
        container_name: Full container name.
        tail: Number of log lines to retrieve.

    Returns:
        Log text, or an error message string when retrieval fails.
    """
    client = _get_client()
    try:
        raw = client.containers.get(container_name).logs(tail=tail, timestamps=True)
    except NotFound:
        return f"Container {container_name} not found."
    except DockerException as exc:
        return f"Error retrieving logs: {exc}"
    return raw.decode("utf-8", errors="replace")
def get_all_container_logs(container_prefix: str, tail: int = 100) -> dict[str, str]:
    """Collect recent logs from every container whose name matches a prefix.

    Args:
        container_prefix: Container name prefix.
        tail: Lines per container.

    Returns:
        Dict mapping container name to log text (or an error marker).
    """
    collected: dict[str, str] = {}
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            try:
                raw = container.logs(tail=tail, timestamps=True)
                collected[container.name] = raw.decode("utf-8", errors="replace")
            except DockerException:
                collected[container.name] = "Error retrieving logs."
    except DockerException as exc:
        logger.error("Failed to list containers: %s", exc)
    return collected
def wait_for_healthy(container_prefix: str, timeout: int = 60) -> bool:
    """Poll until every container with the given prefix reports ``running``.

    Args:
        container_prefix: Container name prefix.
        timeout: Maximum seconds to wait.

    Returns:
        True if all containers started within the timeout, else False.
    """
    client = _get_client()
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            matched = client.containers.list(
                all=True, filters={"name": container_prefix}
            )
            if not matched:
                # Nothing created yet — give compose a moment and retry.
                time.sleep(2)
                continue
            if all(c.status == "running" for c in matched):
                logger.info("All containers for %s are running.", container_prefix)
                return True
        except DockerException as exc:
            logger.warning("Health check error: %s", exc)
        time.sleep(3)
    logger.warning("Timeout waiting for %s containers to start.", container_prefix)
    return False
def get_docker_stats(container_prefix: str) -> list[dict[str, Any]]:
    """Retrieve resource usage stats for containers matching a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        List of dicts with CPU percent and memory usage/limit figures;
        a ``{"name": ..., "error": ...}`` entry per container whose stats
        could not be read.
    """
    client = _get_client()
    stats_list: list[dict[str, Any]] = []
    try:
        containers = client.containers.list(filters={"name": container_prefix})
        for c in containers:
            try:
                raw = c.stats(stream=False)
                cpu_stats = raw.get("cpu_stats", {})
                precpu_stats = raw.get("precpu_stats", {})
                cpu_delta = (
                    cpu_stats.get("cpu_usage", {}).get("total_usage", 0)
                    - precpu_stats.get("cpu_usage", {}).get("total_usage", 0)
                )
                system_delta = (
                    cpu_stats.get("system_cpu_usage", 0)
                    - precpu_stats.get("system_cpu_usage", 0)
                )
                # Fix: on cgroup v2 hosts the "percpu_usage" list is absent,
                # so len(default [1]) undercounted CPU. Prefer the explicit
                # "online_cpus" field and fall back to the old heuristic.
                num_cpus = cpu_stats.get("online_cpus") or len(
                    cpu_stats.get("cpu_usage", {}).get("percpu_usage", [1])
                )
                cpu_pct = 0.0
                if system_delta > 0:
                    cpu_pct = (cpu_delta / system_delta) * num_cpus * 100
                mem_usage = raw.get("memory_stats", {}).get("usage", 0)
                mem_limit = raw.get("memory_stats", {}).get("limit", 1)
                stats_list.append({
                    "name": c.name,
                    "cpu_percent": round(cpu_pct, 2),
                    "memory_usage_mb": round(mem_usage / 1024 / 1024, 1),
                    "memory_limit_mb": round(mem_limit / 1024 / 1024, 1),
                    "memory_percent": round((mem_usage / mem_limit) * 100, 1) if mem_limit else 0,
                })
            except DockerException:
                stats_list.append({"name": c.name, "error": "Failed to get stats"})
    except DockerException as exc:
        logger.error("Failed to get docker stats: %s", exc)
    return stats_list
def remove_instance_containers(container_prefix: str) -> bool:
    """Force-remove every container whose name matches a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        True if removal succeeded, False on any Docker API error.
    """
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            logger.info("Removing container %s", container.name)
            container.remove(force=True)
    except DockerException as exc:
        logger.error("Failed to remove containers: %s", exc)
        return False
    return True

View File

@@ -0,0 +1,396 @@
"""NetBird deployment orchestration service.
Coordinates the full customer deployment lifecycle:
1. Validate inputs
2. Allocate ports
3. Generate configs from Jinja2 templates
4. Create instance directory and write files
5. Start Docker containers
6. Wait for health checks
7. Create NPM proxy hosts
8. Update database
Includes comprehensive rollback on failure.
"""
import logging
import os
import shutil
from datetime import datetime
from typing import Any
from jinja2 import Environment, FileSystemLoader
from sqlalchemy.orm import Session
from app.models import Customer, Deployment, DeploymentLog, SystemConfig
from app.services import docker_service, npm_service, port_manager
from app.utils.config import get_system_config
from app.utils.security import encrypt_value, generate_relay_secret
logger = logging.getLogger(__name__)
# Absolute path to the Jinja2 template directory: joins "templates" onto the
# directory three dirname() levels above this file (presumably the project
# root — TODO confirm against the repo layout).
TEMPLATE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "templates")


def _get_jinja_env() -> Environment:
    """Create a Jinja2 environment for template rendering.

    Returns:
        Environment loading from TEMPLATE_DIR; keep_trailing_newline ensures
        rendered config files retain their final newline.
    """
    return Environment(
        loader=FileSystemLoader(TEMPLATE_DIR),
        keep_trailing_newline=True,
    )
def _log_action(
    db: Session, customer_id: int, action: str, status: str, message: str, details: str = ""
) -> None:
    """Persist a single deployment log entry and commit immediately.

    Args:
        db: Active session.
        customer_id: The customer this log belongs to.
        action: Action name (e.g. ``deploy``, ``stop``).
        status: ``success``, ``error``, or ``info``.
        message: Human-readable message.
        details: Additional details (optional).
    """
    entry = DeploymentLog(
        customer_id=customer_id,
        action=action,
        status=status,
        message=message,
        details=details,
    )
    db.add(entry)
    db.commit()
async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Execute the full deployment workflow for a customer.

    Workflow: allocate a relay UDP port, generate a relay secret, create the
    instance directory tree, render docker-compose.yml / management.json /
    relay.env from Jinja2 templates, start the containers, wait for them to
    come up, create the NPM proxy host, and persist a Deployment row.
    Any exception triggers a rollback (compose down with volumes, instance
    directory removed) and marks the customer as ``error``.

    Args:
        db: Active session.
        customer_id: Customer to deploy.

    Returns:
        Dict with ``success``, ``setup_url``, or ``error``.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}
    config = get_system_config(db)
    if not config:
        return {"success": False, "error": "System not configured. Please set up system settings first."}
    # Update status to deploying so callers can observe progress immediately
    customer.status = "deploying"
    db.commit()
    _log_action(db, customer_id, "deploy", "info", "Deployment started.")
    # Resources created so far, tracked for rollback in the except block.
    allocated_port = None
    instance_dir = None
    container_prefix = f"netbird-kunde{customer_id}"
    try:
        # Step 1: Allocate relay UDP port
        allocated_port = port_manager.allocate_port(db, config.relay_base_port)
        _log_action(db, customer_id, "deploy", "info", f"Allocated UDP port {allocated_port}.")
        # Step 2: Generate relay secret
        relay_secret = generate_relay_secret()
        # Step 3: Create instance directory
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
        os.makedirs(instance_dir, exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "management"), exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "signal"), exist_ok=True)
        _log_action(db, customer_id, "deploy", "info", f"Created directory {instance_dir}.")
        # Step 4: Render templates
        jinja_env = _get_jinja_env()
        template_vars = {
            "customer_id": customer_id,
            "subdomain": customer.subdomain,
            "base_domain": config.base_domain,
            "instance_dir": instance_dir,
            "relay_udp_port": allocated_port,
            "relay_secret": relay_secret,
            "netbird_management_image": config.netbird_management_image,
            "netbird_signal_image": config.netbird_signal_image,
            "netbird_relay_image": config.netbird_relay_image,
            "netbird_dashboard_image": config.netbird_dashboard_image,
            "docker_network": config.docker_network,
        }
        # docker-compose.yml
        dc_template = jinja_env.get_template("docker-compose.yml.j2")
        dc_content = dc_template.render(**template_vars)
        with open(os.path.join(instance_dir, "docker-compose.yml"), "w") as f:
            f.write(dc_content)
        # management.json
        mgmt_template = jinja_env.get_template("management.json.j2")
        mgmt_content = mgmt_template.render(**template_vars)
        with open(os.path.join(instance_dir, "management.json"), "w") as f:
            f.write(mgmt_content)
        # relay.env — NOTE(review): contains the plaintext relay secret on
        # disk; confirm directory permissions are restricted appropriately.
        relay_template = jinja_env.get_template("relay.env.j2")
        relay_content = relay_template.render(**template_vars)
        with open(os.path.join(instance_dir, "relay.env"), "w") as f:
            f.write(relay_content)
        _log_action(db, customer_id, "deploy", "info", "Configuration files generated.")
        # Step 5: Start Docker containers (raises RuntimeError on failure)
        docker_service.compose_up(instance_dir, container_prefix)
        _log_action(db, customer_id, "deploy", "info", "Docker containers started.")
        # Step 6: Wait for containers to be healthy
        healthy = docker_service.wait_for_healthy(container_prefix, timeout=60)
        if not healthy:
            _log_action(
                db, customer_id, "deploy", "error",
                "Containers did not become healthy within 60 seconds."
            )
            # Don't fail completely — containers might still come up
        # Step 7: Create NPM proxy host
        domain = f"{customer.subdomain}.{config.base_domain}"
        dashboard_container = f"netbird-kunde{customer_id}-dashboard"
        npm_result = await npm_service.create_proxy_host(
            api_url=config.npm_api_url,
            api_token=config.npm_api_token,
            domain=domain,
            forward_host=dashboard_container,
            forward_port=80,
            admin_email=config.admin_email,
            subdomain=customer.subdomain,
            customer_id=customer_id,
        )
        npm_proxy_id = npm_result.get("proxy_id")
        if npm_result.get("error"):
            _log_action(
                db, customer_id, "deploy", "error",
                f"NPM proxy creation failed: {npm_result['error']}",
            )
            # Continue — deployment works without NPM, admin can fix later
        # Step 8: Create deployment record
        setup_url = f"https://{domain}"
        # NOTE(review): datetime.utcnow() is naive and deprecated since
        # Python 3.12 — consider datetime.now(timezone.utc).
        deployment = Deployment(
            customer_id=customer_id,
            container_prefix=container_prefix,
            relay_udp_port=allocated_port,
            npm_proxy_id=npm_proxy_id,
            relay_secret=encrypt_value(relay_secret),
            setup_url=setup_url,
            deployment_status="running",
            deployed_at=datetime.utcnow(),
        )
        db.add(deployment)
        customer.status = "active"
        db.commit()
        _log_action(db, customer_id, "deploy", "success", f"Deployment complete. URL: {setup_url}")
        return {"success": True, "setup_url": setup_url}
    except Exception as exc:
        logger.exception("Deployment failed for customer %d", customer_id)
        # Rollback: stop containers if they were started
        try:
            docker_service.compose_down(
                instance_dir or os.path.join(config.data_dir, f"kunde{customer_id}"),
                container_prefix,
                remove_volumes=True,
            )
        except Exception:
            pass
        # Rollback: remove instance directory
        if instance_dir and os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
            except Exception:
                pass
        # The allocated port needs no explicit release here: port_manager
        # treats a port as taken only while a Deployment row exists, and
        # none was committed on this path.
        customer.status = "error"
        db.commit()
        _log_action(
            db, customer_id, "deploy", "error",
            f"Deployment failed: {exc}",
            details=str(exc),
        )
        return {"success": False, "error": str(exc)}
async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Remove all resources for a customer deployment.

    Best-effort, sequential cleanup: containers + volumes, NPM proxy host,
    instance directory, then the Deployment row. A failure in any step is
    logged and the remaining steps still run.

    Args:
        db: Active session.
        customer_id: Customer to undeploy.

    Returns:
        Dict with ``success`` bool.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment and config:
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
        # Stop and remove containers
        try:
            docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True)
            _log_action(db, customer_id, "undeploy", "info", "Containers removed.")
        except Exception as exc:
            _log_action(db, customer_id, "undeploy", "error", f"Container removal error: {exc}")
        # Remove NPM proxy host
        if deployment.npm_proxy_id and config.npm_api_token:
            try:
                await npm_service.delete_proxy_host(
                    config.npm_api_url, config.npm_api_token, deployment.npm_proxy_id
                )
                _log_action(db, customer_id, "undeploy", "info", "NPM proxy host removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"NPM removal error: {exc}")
        # Remove instance directory
        if os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
                _log_action(db, customer_id, "undeploy", "info", "Instance directory removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"Directory removal error: {exc}")
        # Remove deployment record (also frees the relay UDP port, which
        # port_manager derives from existing Deployment rows)
        db.delete(deployment)
        db.commit()
    # NOTE(review): customer.status is not updated here — confirm whether it
    # should be reset (e.g. to "inactive") after undeployment.
    _log_action(db, customer_id, "undeploy", "success", "Undeployment complete.")
    return {"success": True}
def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Stop the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to stop.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    stopped = docker_service.compose_stop(instance_dir, deployment.container_prefix)
    if not stopped:
        _log_action(db, customer_id, "stop", "error", "Failed to stop containers.")
        return {"success": False}
    deployment.deployment_status = "stopped"
    db.commit()
    _log_action(db, customer_id, "stop", "success", "Containers stopped.")
    return {"success": True}
def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Start the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to start.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    started = docker_service.compose_start(instance_dir, deployment.container_prefix)
    if not started:
        _log_action(db, customer_id, "start", "error", "Failed to start containers.")
        return {"success": False}
    deployment.deployment_status = "running"
    db.commit()
    _log_action(db, customer_id, "start", "success", "Containers started.")
    return {"success": True}
def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Restart the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to restart.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    restarted = docker_service.compose_restart(instance_dir, deployment.container_prefix)
    if not restarted:
        _log_action(db, customer_id, "restart", "error", "Failed to restart containers.")
        return {"success": False}
    deployment.deployment_status = "running"
    db.commit()
    _log_action(db, customer_id, "restart", "success", "Containers restarted.")
    return {"success": True}
def get_customer_health(db: Session, customer_id: int) -> dict[str, Any]:
    """Check health of a customer's deployment.

    Args:
        db: Active session.
        customer_id: Customer ID.

    Returns:
        Dict with container statuses and overall health.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    if deployment is None:
        return {"healthy": False, "error": "No deployment found.", "containers": []}
    containers = docker_service.get_container_status(deployment.container_prefix)
    all_running = bool(containers) and all(c["status"] == "running" for c in containers)
    # Record when this check ran
    deployment.last_health_check = datetime.utcnow()
    if all_running:
        deployment.deployment_status = "running"
    elif containers:
        # At least one container exists but is not running
        deployment.deployment_status = "failed"
    db.commit()
    return {
        "healthy": all_running,
        "containers": containers,
        "deployment_status": deployment.deployment_status,
        "last_check": deployment.last_health_check.isoformat(),
    }

234
app/services/npm_service.py Normal file
View File

@@ -0,0 +1,234 @@
"""Nginx Proxy Manager API integration.
Creates, updates, and deletes proxy host entries so each customer's NetBird
dashboard is accessible at ``{subdomain}.{base_domain}`` with automatic
Let's Encrypt SSL certificates.
"""
import logging
from typing import Any, Optional
import httpx
logger = logging.getLogger(__name__)
# Timeout for NPM API calls (seconds)
NPM_TIMEOUT = 30
async def test_npm_connection(api_url: str, api_token: str) -> dict[str, Any]:
    """Probe the Nginx Proxy Manager API with the given credentials.

    Args:
        api_url: NPM API base URL (e.g. ``http://npm:81/api``).
        api_token: Bearer token for authentication.

    Returns:
        Dict with ``ok`` (bool) and ``message`` (str).
    """
    auth_headers = {"Authorization": f"Bearer {api_token}"}
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.get(f"{api_url}/nginx/proxy-hosts", headers=auth_headers)
            if resp.status_code != 200:
                return {
                    "ok": False,
                    "message": f"NPM returned status {resp.status_code}: {resp.text[:200]}",
                }
            count = len(resp.json())
            return {"ok": True, "message": f"Connected. {count} proxy hosts found."}
    except httpx.ConnectError:
        return {"ok": False, "message": "Connection refused. Is NPM running?"}
    except httpx.TimeoutException:
        return {"ok": False, "message": "Connection timed out."}
    except Exception as exc:
        return {"ok": False, "message": f"Unexpected error: {exc}"}
async def create_proxy_host(
    api_url: str,
    api_token: str,
    domain: str,
    forward_host: str,
    forward_port: int = 80,
    admin_email: str = "",
    subdomain: str = "",
    customer_id: int = 0,
) -> dict[str, Any]:
    """Create a proxy host entry in NPM with SSL for a customer.

    The proxy routes traffic as follows:
    - ``/`` -> dashboard container (port 80)
    - ``/api`` -> management container (port 80)
    - ``/signalexchange.*`` -> signal container (port 80)
    - ``/relay`` -> relay container (port 80)

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        domain: Full domain (e.g. ``kunde1.example.com``).
        forward_host: Container name for the dashboard.
        forward_port: Port to forward to (default 80).
        admin_email: Email for Let's Encrypt.
        subdomain: Customer subdomain for building container names.
        customer_id: Customer ID for building container names.

    Returns:
        Dict with ``proxy_id`` on success or ``error`` on failure.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    # Build advanced Nginx config to route sub-paths to different containers.
    # Container names follow the netbird-kunde{id}-{service} convention used
    # by the deployment service.
    mgmt_container = f"netbird-kunde{customer_id}-management"
    signal_container = f"netbird-kunde{customer_id}-signal"
    relay_container = f"netbird-kunde{customer_id}-relay"
    advanced_config = f"""
# NetBird Management API
location /api {{
    proxy_pass http://{mgmt_container}:80;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}}
# NetBird Signal (gRPC-Web)
location /signalexchange. {{
    grpc_pass grpc://{signal_container}:80;
    grpc_set_header Host $host;
}}
# NetBird Relay (WebSocket)
location /relay {{
    proxy_pass http://{relay_container}:80;
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
}}
"""
    payload = {
        "domain_names": [domain],
        "forward_scheme": "http",
        "forward_host": forward_host,
        "forward_port": forward_port,
        # certificate_id 0 = no certificate yet; _request_ssl below requests
        # one and assigns it to this proxy host afterwards.
        "certificate_id": 0,
        "ssl_forced": True,
        "hsts_enabled": True,
        "hsts_subdomains": False,
        "http2_support": True,
        "block_exploits": True,
        "allow_websocket_upgrade": True,
        "access_list_id": 0,
        "advanced_config": advanced_config.strip(),
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.post(
                f"{api_url}/nginx/proxy-hosts", json=payload, headers=headers
            )
            if resp.status_code in (200, 201):
                data = resp.json()
                proxy_id = data.get("id")
                logger.info("Created NPM proxy host %s (id=%s)", domain, proxy_id)
                # Request SSL certificate (best effort — failures are only
                # logged inside _request_ssl, the proxy host still exists)
                await _request_ssl(client, api_url, headers, proxy_id, domain, admin_email)
                return {"proxy_id": proxy_id}
            else:
                error_msg = f"NPM returned {resp.status_code}: {resp.text[:300]}"
                logger.error("Failed to create proxy host: %s", error_msg)
                return {"error": error_msg}
    except Exception as exc:
        logger.error("NPM API error: %s", exc)
        return {"error": str(exc)}
async def _request_ssl(
    client: httpx.AsyncClient,
    api_url: str,
    headers: dict,
    proxy_id: int,
    domain: str,
    admin_email: str,
) -> None:
    """Request a Let's Encrypt certificate and attach it to a proxy host.

    Failures are logged (warning level) and never raised.

    Args:
        client: httpx client.
        api_url: NPM API base URL.
        headers: Auth headers.
        proxy_id: The proxy host ID.
        domain: The domain to certify.
        admin_email: Contact email for LE.
    """
    cert_request = {
        "domain_names": [domain],
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }
    try:
        resp = await client.post(
            f"{api_url}/nginx/certificates", json=cert_request, headers=headers
        )
        if resp.status_code not in (200, 201):
            logger.warning("SSL request returned %s: %s", resp.status_code, resp.text[:200])
            return
        cert_id = resp.json().get("id")
        # Assign the freshly issued certificate to the proxy host
        await client.put(
            f"{api_url}/nginx/proxy-hosts/{proxy_id}",
            json={"certificate_id": cert_id},
            headers=headers,
        )
        logger.info("SSL certificate assigned to proxy host %s", proxy_id)
    except Exception as exc:
        logger.warning("SSL certificate request failed: %s", exc)
async def delete_proxy_host(
    api_url: str, api_token: str, proxy_id: int
) -> bool:
    """Remove a proxy host entry from NPM.

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        proxy_id: The proxy host ID to delete.

    Returns:
        True on success.
    """
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.delete(
                f"{api_url}/nginx/proxy-hosts/{proxy_id}",
                headers={"Authorization": f"Bearer {api_token}"},
            )
    except Exception as exc:
        logger.error("NPM delete error: %s", exc)
        return False
    if resp.status_code in (200, 204):
        logger.info("Deleted NPM proxy host %d", proxy_id)
        return True
    logger.warning(
        "Failed to delete proxy host %d: %s %s",
        proxy_id, resp.status_code, resp.text[:200],
    )
    return False

View File

@@ -0,0 +1,110 @@
"""UDP port allocation service for NetBird relay/STUN ports.
Manages the range starting at relay_base_port (default 3478). Each customer
gets one unique UDP port. The manager checks both the database and the OS
to avoid collisions.
"""
import logging
import socket
from typing import Optional
from sqlalchemy.orm import Session
from app.models import Deployment
logger = logging.getLogger(__name__)
def _is_udp_port_in_use(port: int) -> bool:
"""Check whether a UDP port is currently bound on the host.
Args:
port: UDP port number to probe.
Returns:
True if the port is in use.
"""
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
sock.bind(("0.0.0.0", port))
return False
except OSError:
return True
finally:
sock.close()
def get_allocated_ports(db: Session) -> set[int]:
    """Return the set of relay UDP ports already assigned in the database.

    Args:
        db: Active SQLAlchemy session.

    Returns:
        Set of port numbers.
    """
    return {row[0] for row in db.query(Deployment.relay_udp_port).all()}
def allocate_port(db: Session, base_port: int = 3478, max_ports: int = 100) -> int:
    """Find and return the next available relay UDP port.

    Scans from *base_port* to *base_port + max_ports - 1*, skipping ports
    that are either already in the database or currently bound on the host.
    Note there is a small race window between this check and the caller
    committing a Deployment row that claims the port.

    Args:
        db: Active SQLAlchemy session.
        base_port: Start of the port range.
        max_ports: Number of ports in the range.

    Returns:
        An available port number.

    Raises:
        RuntimeError: If no port in the range is available.
    """
    allocated = get_allocated_ports(db)
    for port in range(base_port, base_port + max_ports):
        if port in allocated:
            continue
        if _is_udp_port_in_use(port):
            logger.warning("Port %d is in use on the host, skipping.", port)
            continue
        logger.info("Allocated relay UDP port %d.", port)
        return port
    # Fix: previously this message hard-coded "All 100 ports" regardless of
    # the max_ports argument; report the actual range size instead.
    raise RuntimeError(
        f"No available relay ports in range {base_port}-{base_port + max_ports - 1}. "
        f"All {max_ports} ports are allocated."
    )
def release_port(db: Session, port: int) -> None:
    """Log that a relay UDP port has been handed back.

    No bookkeeping happens here: a port counts as free again as soon as its
    Deployment row is deleted. This helper exists purely to leave an explicit
    audit trail in rollback scenarios.

    Args:
        db: Active SQLAlchemy session.
        port: The port to release.
    """
    logger.info("Released relay UDP port %d.", port)
def validate_port_available(db: Session, port: int) -> bool:
    """Check if a specific port is available both in DB and on the host.

    Args:
        db: Active SQLAlchemy session.
        port: Port number to check.

    Returns:
        True if the port is available.
    """
    if port in get_allocated_ports(db):
        return False
    return not _is_udp_port_in_use(port)