First Build alpha 0.1

This commit is contained in:
2026-02-07 12:18:20 +01:00
parent 29e83436b2
commit 42a3cc9d9f
36 changed files with 4982 additions and 51 deletions

0
app/services/__init__.py Normal file
View File

View File

@@ -0,0 +1,334 @@
"""Docker container management via the Python Docker SDK.
Responsible for creating, starting, stopping, restarting, and removing
per-customer Docker Compose stacks. Also provides log retrieval and
container health/status information.
"""
import logging
import os
import subprocess
import time
from typing import Any, Optional
import docker
from docker.errors import DockerException, NotFound
logger = logging.getLogger(__name__)
def _get_client() -> docker.DockerClient:
    """Build a Docker client from environment settings (Unix socket by default).

    Returns:
        A connected docker.DockerClient instance.
    """
    return docker.from_env()
def compose_up(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose up -d`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name (e.g. ``netbird-kunde5``).

    Returns:
        True on success.

    Raises:
        FileNotFoundError: If ``docker-compose.yml`` is missing from instance_dir.
        RuntimeError: If ``docker compose up`` exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds 120 seconds.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    if not os.path.isfile(compose_file):
        raise FileNotFoundError(f"docker-compose.yml not found at {compose_file}")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "up", "-d", "--remove-orphans",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        logger.error("docker compose up failed: %s", result.stderr)
        raise RuntimeError(f"docker compose up failed: {result.stderr}")
    logger.info("docker compose up succeeded for %s", project_name)
    return True
def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
    """Tear down a customer's stack via ``docker compose down``.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.
        remove_volumes: Whether to also remove volumes.

    Returns:
        True on success (non-zero exit is logged, not raised).
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = ["docker", "compose", "-f", compose_file, "-p", project_name, "down"]
    if remove_volumes:
        cmd += ["-v"]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        logger.warning("docker compose down returned non-zero: %s", completed.stderr)
    return True
def compose_stop(instance_dir: str, project_name: str) -> bool:
    """Stop a customer's compose stack without removing containers.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "stop",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def compose_start(instance_dir: str, project_name: str) -> bool:
    """Start a previously stopped customer compose stack.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "start",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def compose_restart(instance_dir: str, project_name: str) -> bool:
    """Restart a customer's compose stack in place.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    cmd = [
        "docker", "compose",
        "-f", os.path.join(instance_dir, "docker-compose.yml"),
        "-p", project_name,
        "restart",
    ]
    logger.info("Running: %s", " ".join(cmd))
    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return completed.returncode == 0
def get_container_status(container_prefix: str) -> list[dict[str, Any]]:
    """Get the status of all containers matching a name prefix.

    Args:
        container_prefix: Container name prefix (e.g. ``netbird-kunde5``).

    Returns:
        List of dicts with container name, status, health, image, and
        creation timestamp. Empty list on Docker API errors.
    """
    results: list[dict[str, Any]] = []
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            state = container.attrs.get("State", {})
            health = state["Health"].get("Status", "N/A") if state.get("Health") else "N/A"
            tags = container.image.tags
            results.append({
                "name": container.name,
                "status": container.status,
                "health": health,
                "image": str(tags[0]) if tags else str(container.image.id[:12]),
                "created": container.attrs.get("Created", ""),
            })
    except DockerException as exc:
        logger.error("Failed to get container status: %s", exc)
    return results
def get_container_logs(container_name: str, tail: int = 200) -> str:
    """Retrieve recent timestamped logs from one container.

    Args:
        container_name: Full container name.
        tail: Number of log lines to retrieve.

    Returns:
        Log text, or an error message string when retrieval fails.
    """
    client = _get_client()
    try:
        raw = client.containers.get(container_name).logs(tail=tail, timestamps=True)
    except NotFound:
        return f"Container {container_name} not found."
    except DockerException as exc:
        return f"Error retrieving logs: {exc}"
    return raw.decode("utf-8", errors="replace")
def get_all_container_logs(container_prefix: str, tail: int = 100) -> dict[str, str]:
    """Collect recent logs from every container whose name matches a prefix.

    Args:
        container_prefix: Container name prefix.
        tail: Lines per container.

    Returns:
        Dict mapping container name to log text (or an error marker).
    """
    collected: dict[str, str] = {}
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            try:
                raw = container.logs(tail=tail, timestamps=True)
                collected[container.name] = raw.decode("utf-8", errors="replace")
            except DockerException:
                collected[container.name] = "Error retrieving logs."
    except DockerException as exc:
        logger.error("Failed to list containers: %s", exc)
    return collected
def wait_for_healthy(container_prefix: str, timeout: int = 60) -> bool:
    """Poll until every container with the given prefix reports ``running``.

    Args:
        container_prefix: Container name prefix.
        timeout: Maximum seconds to wait.

    Returns:
        True if all containers started within the timeout, else False.
    """
    client = _get_client()
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            matched = client.containers.list(
                all=True, filters={"name": container_prefix}
            )
            if not matched:
                # Nothing created yet — give compose a moment and retry.
                time.sleep(2)
                continue
            if all(c.status == "running" for c in matched):
                logger.info("All containers for %s are running.", container_prefix)
                return True
        except DockerException as exc:
            logger.warning("Health check error: %s", exc)
        time.sleep(3)
    logger.warning("Timeout waiting for %s containers to start.", container_prefix)
    return False
def get_docker_stats(container_prefix: str) -> list[dict[str, Any]]:
    """Retrieve resource usage stats for containers matching a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        List of dicts with CPU percent and memory usage/limit figures;
        a ``{"name": ..., "error": ...}`` entry per container whose stats
        could not be read.
    """
    client = _get_client()
    stats_list: list[dict[str, Any]] = []
    try:
        containers = client.containers.list(filters={"name": container_prefix})
        for c in containers:
            try:
                raw = c.stats(stream=False)
                cpu_stats = raw.get("cpu_stats", {})
                precpu_stats = raw.get("precpu_stats", {})
                cpu_delta = (
                    cpu_stats.get("cpu_usage", {}).get("total_usage", 0)
                    - precpu_stats.get("cpu_usage", {}).get("total_usage", 0)
                )
                system_delta = (
                    cpu_stats.get("system_cpu_usage", 0)
                    - precpu_stats.get("system_cpu_usage", 0)
                )
                # Fix: on cgroup v2 hosts the "percpu_usage" list is absent,
                # so len(default [1]) undercounted CPU. Prefer the explicit
                # "online_cpus" field and fall back to the old heuristic.
                num_cpus = cpu_stats.get("online_cpus") or len(
                    cpu_stats.get("cpu_usage", {}).get("percpu_usage", [1])
                )
                cpu_pct = 0.0
                if system_delta > 0:
                    cpu_pct = (cpu_delta / system_delta) * num_cpus * 100
                mem_usage = raw.get("memory_stats", {}).get("usage", 0)
                mem_limit = raw.get("memory_stats", {}).get("limit", 1)
                stats_list.append({
                    "name": c.name,
                    "cpu_percent": round(cpu_pct, 2),
                    "memory_usage_mb": round(mem_usage / 1024 / 1024, 1),
                    "memory_limit_mb": round(mem_limit / 1024 / 1024, 1),
                    "memory_percent": round((mem_usage / mem_limit) * 100, 1) if mem_limit else 0,
                })
            except DockerException:
                stats_list.append({"name": c.name, "error": "Failed to get stats"})
    except DockerException as exc:
        logger.error("Failed to get docker stats: %s", exc)
    return stats_list
def remove_instance_containers(container_prefix: str) -> bool:
    """Force-remove every container whose name matches a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        True if removal succeeded, False on any Docker API error.
    """
    client = _get_client()
    try:
        for container in client.containers.list(all=True, filters={"name": container_prefix}):
            logger.info("Removing container %s", container.name)
            container.remove(force=True)
    except DockerException as exc:
        logger.error("Failed to remove containers: %s", exc)
        return False
    return True

View File

@@ -0,0 +1,396 @@
"""NetBird deployment orchestration service.
Coordinates the full customer deployment lifecycle:
1. Validate inputs
2. Allocate ports
3. Generate configs from Jinja2 templates
4. Create instance directory and write files
5. Start Docker containers
6. Wait for health checks
7. Create NPM proxy hosts
8. Update database
Includes comprehensive rollback on failure.
"""
import logging
import os
import shutil
from datetime import datetime
from typing import Any
from jinja2 import Environment, FileSystemLoader
from sqlalchemy.orm import Session
from app.models import Customer, Deployment, DeploymentLog, SystemConfig
from app.services import docker_service, npm_service, port_manager
from app.utils.config import get_system_config
from app.utils.security import encrypt_value, generate_relay_secret
logger = logging.getLogger(__name__)
# Absolute path to the Jinja2 template directory: joins "templates" onto the
# directory three dirname() levels above this file (presumably the project
# root — TODO confirm against the repo layout).
TEMPLATE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "templates")


def _get_jinja_env() -> Environment:
    """Create a Jinja2 environment for template rendering.

    Returns:
        Environment loading from TEMPLATE_DIR; keep_trailing_newline ensures
        rendered config files retain their final newline.
    """
    return Environment(
        loader=FileSystemLoader(TEMPLATE_DIR),
        keep_trailing_newline=True,
    )
def _log_action(
    db: Session, customer_id: int, action: str, status: str, message: str, details: str = ""
) -> None:
    """Persist a single deployment log entry and commit immediately.

    Args:
        db: Active session.
        customer_id: The customer this log belongs to.
        action: Action name (e.g. ``deploy``, ``stop``).
        status: ``success``, ``error``, or ``info``.
        message: Human-readable message.
        details: Additional details (optional).
    """
    entry = DeploymentLog(
        customer_id=customer_id,
        action=action,
        status=status,
        message=message,
        details=details,
    )
    db.add(entry)
    db.commit()
async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Execute the full deployment workflow for a customer.

    Workflow: allocate a relay UDP port, generate a relay secret, create the
    instance directory tree, render docker-compose.yml / management.json /
    relay.env from Jinja2 templates, start the containers, wait for them to
    come up, create the NPM proxy host, and persist a Deployment row.
    Any exception triggers a rollback (compose down with volumes, instance
    directory removed) and marks the customer as ``error``.

    Args:
        db: Active session.
        customer_id: Customer to deploy.

    Returns:
        Dict with ``success``, ``setup_url``, or ``error``.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}
    config = get_system_config(db)
    if not config:
        return {"success": False, "error": "System not configured. Please set up system settings first."}
    # Update status to deploying so callers can observe progress immediately
    customer.status = "deploying"
    db.commit()
    _log_action(db, customer_id, "deploy", "info", "Deployment started.")
    # Resources created so far, tracked for rollback in the except block.
    allocated_port = None
    instance_dir = None
    container_prefix = f"netbird-kunde{customer_id}"
    try:
        # Step 1: Allocate relay UDP port
        allocated_port = port_manager.allocate_port(db, config.relay_base_port)
        _log_action(db, customer_id, "deploy", "info", f"Allocated UDP port {allocated_port}.")
        # Step 2: Generate relay secret
        relay_secret = generate_relay_secret()
        # Step 3: Create instance directory
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
        os.makedirs(instance_dir, exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "management"), exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "signal"), exist_ok=True)
        _log_action(db, customer_id, "deploy", "info", f"Created directory {instance_dir}.")
        # Step 4: Render templates
        jinja_env = _get_jinja_env()
        template_vars = {
            "customer_id": customer_id,
            "subdomain": customer.subdomain,
            "base_domain": config.base_domain,
            "instance_dir": instance_dir,
            "relay_udp_port": allocated_port,
            "relay_secret": relay_secret,
            "netbird_management_image": config.netbird_management_image,
            "netbird_signal_image": config.netbird_signal_image,
            "netbird_relay_image": config.netbird_relay_image,
            "netbird_dashboard_image": config.netbird_dashboard_image,
            "docker_network": config.docker_network,
        }
        # docker-compose.yml
        dc_template = jinja_env.get_template("docker-compose.yml.j2")
        dc_content = dc_template.render(**template_vars)
        with open(os.path.join(instance_dir, "docker-compose.yml"), "w") as f:
            f.write(dc_content)
        # management.json
        mgmt_template = jinja_env.get_template("management.json.j2")
        mgmt_content = mgmt_template.render(**template_vars)
        with open(os.path.join(instance_dir, "management.json"), "w") as f:
            f.write(mgmt_content)
        # relay.env — NOTE(review): contains the plaintext relay secret on
        # disk; confirm directory permissions are restricted appropriately.
        relay_template = jinja_env.get_template("relay.env.j2")
        relay_content = relay_template.render(**template_vars)
        with open(os.path.join(instance_dir, "relay.env"), "w") as f:
            f.write(relay_content)
        _log_action(db, customer_id, "deploy", "info", "Configuration files generated.")
        # Step 5: Start Docker containers (raises RuntimeError on failure)
        docker_service.compose_up(instance_dir, container_prefix)
        _log_action(db, customer_id, "deploy", "info", "Docker containers started.")
        # Step 6: Wait for containers to be healthy
        healthy = docker_service.wait_for_healthy(container_prefix, timeout=60)
        if not healthy:
            _log_action(
                db, customer_id, "deploy", "error",
                "Containers did not become healthy within 60 seconds."
            )
            # Don't fail completely — containers might still come up
        # Step 7: Create NPM proxy host
        domain = f"{customer.subdomain}.{config.base_domain}"
        dashboard_container = f"netbird-kunde{customer_id}-dashboard"
        npm_result = await npm_service.create_proxy_host(
            api_url=config.npm_api_url,
            api_token=config.npm_api_token,
            domain=domain,
            forward_host=dashboard_container,
            forward_port=80,
            admin_email=config.admin_email,
            subdomain=customer.subdomain,
            customer_id=customer_id,
        )
        npm_proxy_id = npm_result.get("proxy_id")
        if npm_result.get("error"):
            _log_action(
                db, customer_id, "deploy", "error",
                f"NPM proxy creation failed: {npm_result['error']}",
            )
            # Continue — deployment works without NPM, admin can fix later
        # Step 8: Create deployment record
        setup_url = f"https://{domain}"
        # NOTE(review): datetime.utcnow() is naive and deprecated since
        # Python 3.12 — consider datetime.now(timezone.utc).
        deployment = Deployment(
            customer_id=customer_id,
            container_prefix=container_prefix,
            relay_udp_port=allocated_port,
            npm_proxy_id=npm_proxy_id,
            relay_secret=encrypt_value(relay_secret),
            setup_url=setup_url,
            deployment_status="running",
            deployed_at=datetime.utcnow(),
        )
        db.add(deployment)
        customer.status = "active"
        db.commit()
        _log_action(db, customer_id, "deploy", "success", f"Deployment complete. URL: {setup_url}")
        return {"success": True, "setup_url": setup_url}
    except Exception as exc:
        logger.exception("Deployment failed for customer %d", customer_id)
        # Rollback: stop containers if they were started
        try:
            docker_service.compose_down(
                instance_dir or os.path.join(config.data_dir, f"kunde{customer_id}"),
                container_prefix,
                remove_volumes=True,
            )
        except Exception:
            pass
        # Rollback: remove instance directory
        if instance_dir and os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
            except Exception:
                pass
        # The allocated port needs no explicit release here: port_manager
        # treats a port as taken only while a Deployment row exists, and
        # none was committed on this path.
        customer.status = "error"
        db.commit()
        _log_action(
            db, customer_id, "deploy", "error",
            f"Deployment failed: {exc}",
            details=str(exc),
        )
        return {"success": False, "error": str(exc)}
async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Remove all resources for a customer deployment.

    Best-effort, sequential cleanup: containers + volumes, NPM proxy host,
    instance directory, then the Deployment row. A failure in any step is
    logged and the remaining steps still run.

    Args:
        db: Active session.
        customer_id: Customer to undeploy.

    Returns:
        Dict with ``success`` bool.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment and config:
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
        # Stop and remove containers
        try:
            docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True)
            _log_action(db, customer_id, "undeploy", "info", "Containers removed.")
        except Exception as exc:
            _log_action(db, customer_id, "undeploy", "error", f"Container removal error: {exc}")
        # Remove NPM proxy host
        if deployment.npm_proxy_id and config.npm_api_token:
            try:
                await npm_service.delete_proxy_host(
                    config.npm_api_url, config.npm_api_token, deployment.npm_proxy_id
                )
                _log_action(db, customer_id, "undeploy", "info", "NPM proxy host removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"NPM removal error: {exc}")
        # Remove instance directory
        if os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
                _log_action(db, customer_id, "undeploy", "info", "Instance directory removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"Directory removal error: {exc}")
        # Remove deployment record (also frees the relay UDP port, which
        # port_manager derives from existing Deployment rows)
        db.delete(deployment)
        db.commit()
    # NOTE(review): customer.status is not updated here — confirm whether it
    # should be reset (e.g. to "inactive") after undeployment.
    _log_action(db, customer_id, "undeploy", "success", "Undeployment complete.")
    return {"success": True}
def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Stop the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to stop.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    stopped = docker_service.compose_stop(instance_dir, deployment.container_prefix)
    if not stopped:
        _log_action(db, customer_id, "stop", "error", "Failed to stop containers.")
        return {"success": False}
    deployment.deployment_status = "stopped"
    db.commit()
    _log_action(db, customer_id, "stop", "success", "Containers stopped.")
    return {"success": True}
def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Start the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to start.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    started = docker_service.compose_start(instance_dir, deployment.container_prefix)
    if not started:
        _log_action(db, customer_id, "start", "error", "Failed to start containers.")
        return {"success": False}
    deployment.deployment_status = "running"
    db.commit()
    _log_action(db, customer_id, "start", "success", "Containers started.")
    return {"success": True}
def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Restart the containers belonging to a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer whose containers to restart.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if deployment is None or config is None:
        return {"success": False, "error": "Deployment or config not found."}
    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    restarted = docker_service.compose_restart(instance_dir, deployment.container_prefix)
    if not restarted:
        _log_action(db, customer_id, "restart", "error", "Failed to restart containers.")
        return {"success": False}
    deployment.deployment_status = "running"
    db.commit()
    _log_action(db, customer_id, "restart", "success", "Containers restarted.")
    return {"success": True}
def get_customer_health(db: Session, customer_id: int) -> dict[str, Any]:
    """Check health of a customer's deployment.

    Args:
        db: Active session.
        customer_id: Customer ID.

    Returns:
        Dict with container statuses and overall health.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    if deployment is None:
        return {"healthy": False, "error": "No deployment found.", "containers": []}
    containers = docker_service.get_container_status(deployment.container_prefix)
    all_running = bool(containers) and all(c["status"] == "running" for c in containers)
    # Record when this check ran
    deployment.last_health_check = datetime.utcnow()
    if all_running:
        deployment.deployment_status = "running"
    elif containers:
        # At least one container exists but is not running
        deployment.deployment_status = "failed"
    db.commit()
    return {
        "healthy": all_running,
        "containers": containers,
        "deployment_status": deployment.deployment_status,
        "last_check": deployment.last_health_check.isoformat(),
    }

234
app/services/npm_service.py Normal file
View File

@@ -0,0 +1,234 @@
"""Nginx Proxy Manager API integration.
Creates, updates, and deletes proxy host entries so each customer's NetBird
dashboard is accessible at ``{subdomain}.{base_domain}`` with automatic
Let's Encrypt SSL certificates.
"""
import logging
from typing import Any, Optional
import httpx
logger = logging.getLogger(__name__)
# Timeout for NPM API calls (seconds)
NPM_TIMEOUT = 30
async def test_npm_connection(api_url: str, api_token: str) -> dict[str, Any]:
    """Probe the Nginx Proxy Manager API with the given credentials.

    Args:
        api_url: NPM API base URL (e.g. ``http://npm:81/api``).
        api_token: Bearer token for authentication.

    Returns:
        Dict with ``ok`` (bool) and ``message`` (str).
    """
    auth_headers = {"Authorization": f"Bearer {api_token}"}
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.get(f"{api_url}/nginx/proxy-hosts", headers=auth_headers)
            if resp.status_code != 200:
                return {
                    "ok": False,
                    "message": f"NPM returned status {resp.status_code}: {resp.text[:200]}",
                }
            count = len(resp.json())
            return {"ok": True, "message": f"Connected. {count} proxy hosts found."}
    except httpx.ConnectError:
        return {"ok": False, "message": "Connection refused. Is NPM running?"}
    except httpx.TimeoutException:
        return {"ok": False, "message": "Connection timed out."}
    except Exception as exc:
        return {"ok": False, "message": f"Unexpected error: {exc}"}
async def create_proxy_host(
    api_url: str,
    api_token: str,
    domain: str,
    forward_host: str,
    forward_port: int = 80,
    admin_email: str = "",
    subdomain: str = "",
    customer_id: int = 0,
) -> dict[str, Any]:
    """Create a proxy host entry in NPM with SSL for a customer.

    The proxy routes traffic as follows:
    - ``/`` -> dashboard container (port 80)
    - ``/api`` -> management container (port 80)
    - ``/signalexchange.*`` -> signal container (port 80)
    - ``/relay`` -> relay container (port 80)

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        domain: Full domain (e.g. ``kunde1.example.com``).
        forward_host: Container name for the dashboard.
        forward_port: Port to forward to (default 80).
        admin_email: Email for Let's Encrypt.
        subdomain: Customer subdomain for building container names.
        customer_id: Customer ID for building container names.

    Returns:
        Dict with ``proxy_id`` on success or ``error`` on failure.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }
    # Build advanced Nginx config to route sub-paths to different containers.
    # Container names follow the netbird-kunde{id}-{service} convention used
    # by the deployment service.
    mgmt_container = f"netbird-kunde{customer_id}-management"
    signal_container = f"netbird-kunde{customer_id}-signal"
    relay_container = f"netbird-kunde{customer_id}-relay"
    advanced_config = f"""
# NetBird Management API
location /api {{
    proxy_pass http://{mgmt_container}:80;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}}
# NetBird Signal (gRPC-Web)
location /signalexchange. {{
    grpc_pass grpc://{signal_container}:80;
    grpc_set_header Host $host;
}}
# NetBird Relay (WebSocket)
location /relay {{
    proxy_pass http://{relay_container}:80;
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
}}
"""
    payload = {
        "domain_names": [domain],
        "forward_scheme": "http",
        "forward_host": forward_host,
        "forward_port": forward_port,
        # certificate_id 0 = no certificate yet; _request_ssl below requests
        # one and assigns it to this proxy host afterwards.
        "certificate_id": 0,
        "ssl_forced": True,
        "hsts_enabled": True,
        "hsts_subdomains": False,
        "http2_support": True,
        "block_exploits": True,
        "allow_websocket_upgrade": True,
        "access_list_id": 0,
        "advanced_config": advanced_config.strip(),
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.post(
                f"{api_url}/nginx/proxy-hosts", json=payload, headers=headers
            )
            if resp.status_code in (200, 201):
                data = resp.json()
                proxy_id = data.get("id")
                logger.info("Created NPM proxy host %s (id=%s)", domain, proxy_id)
                # Request SSL certificate (best effort — failures are only
                # logged inside _request_ssl, the proxy host still exists)
                await _request_ssl(client, api_url, headers, proxy_id, domain, admin_email)
                return {"proxy_id": proxy_id}
            else:
                error_msg = f"NPM returned {resp.status_code}: {resp.text[:300]}"
                logger.error("Failed to create proxy host: %s", error_msg)
                return {"error": error_msg}
    except Exception as exc:
        logger.error("NPM API error: %s", exc)
        return {"error": str(exc)}
async def _request_ssl(
    client: httpx.AsyncClient,
    api_url: str,
    headers: dict,
    proxy_id: int,
    domain: str,
    admin_email: str,
) -> None:
    """Request a Let's Encrypt certificate and attach it to a proxy host.

    Failures are logged (warning level) and never raised.

    Args:
        client: httpx client.
        api_url: NPM API base URL.
        headers: Auth headers.
        proxy_id: The proxy host ID.
        domain: The domain to certify.
        admin_email: Contact email for LE.
    """
    cert_request = {
        "domain_names": [domain],
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }
    try:
        resp = await client.post(
            f"{api_url}/nginx/certificates", json=cert_request, headers=headers
        )
        if resp.status_code not in (200, 201):
            logger.warning("SSL request returned %s: %s", resp.status_code, resp.text[:200])
            return
        cert_id = resp.json().get("id")
        # Assign the freshly issued certificate to the proxy host
        await client.put(
            f"{api_url}/nginx/proxy-hosts/{proxy_id}",
            json={"certificate_id": cert_id},
            headers=headers,
        )
        logger.info("SSL certificate assigned to proxy host %s", proxy_id)
    except Exception as exc:
        logger.warning("SSL certificate request failed: %s", exc)
async def delete_proxy_host(
    api_url: str, api_token: str, proxy_id: int
) -> bool:
    """Remove a proxy host entry from NPM.

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        proxy_id: The proxy host ID to delete.

    Returns:
        True on success.
    """
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.delete(
                f"{api_url}/nginx/proxy-hosts/{proxy_id}",
                headers={"Authorization": f"Bearer {api_token}"},
            )
    except Exception as exc:
        logger.error("NPM delete error: %s", exc)
        return False
    if resp.status_code in (200, 204):
        logger.info("Deleted NPM proxy host %d", proxy_id)
        return True
    logger.warning(
        "Failed to delete proxy host %d: %s %s",
        proxy_id, resp.status_code, resp.text[:200],
    )
    return False

View File

@@ -0,0 +1,110 @@
"""UDP port allocation service for NetBird relay/STUN ports.
Manages the range starting at relay_base_port (default 3478). Each customer
gets one unique UDP port. The manager checks both the database and the OS
to avoid collisions.
"""
import logging
import socket
from typing import Optional
from sqlalchemy.orm import Session
from app.models import Deployment
logger = logging.getLogger(__name__)
def _is_udp_port_in_use(port: int) -> bool:
"""Check whether a UDP port is currently bound on the host.
Args:
port: UDP port number to probe.
Returns:
True if the port is in use.
"""
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
sock.bind(("0.0.0.0", port))
return False
except OSError:
return True
finally:
sock.close()
def get_allocated_ports(db: Session) -> set[int]:
    """Return the set of relay UDP ports already assigned in the database.

    Args:
        db: Active SQLAlchemy session.

    Returns:
        Set of port numbers.
    """
    return {row[0] for row in db.query(Deployment.relay_udp_port).all()}
def allocate_port(db: Session, base_port: int = 3478, max_ports: int = 100) -> int:
    """Find and return the next available relay UDP port.

    Scans from *base_port* to *base_port + max_ports - 1*, skipping ports
    that are either already in the database or currently bound on the host.
    Note there is a small race window between this check and the caller
    committing a Deployment row that claims the port.

    Args:
        db: Active SQLAlchemy session.
        base_port: Start of the port range.
        max_ports: Number of ports in the range.

    Returns:
        An available port number.

    Raises:
        RuntimeError: If no port in the range is available.
    """
    allocated = get_allocated_ports(db)
    for port in range(base_port, base_port + max_ports):
        if port in allocated:
            continue
        if _is_udp_port_in_use(port):
            logger.warning("Port %d is in use on the host, skipping.", port)
            continue
        logger.info("Allocated relay UDP port %d.", port)
        return port
    # Fix: previously this message hard-coded "All 100 ports" regardless of
    # the max_ports argument; report the actual range size instead.
    raise RuntimeError(
        f"No available relay ports in range {base_port}-{base_port + max_ports - 1}. "
        f"All {max_ports} ports are allocated."
    )
def release_port(db: Session, port: int) -> None:
    """Log that a relay UDP port has been handed back.

    No bookkeeping happens here: a port counts as free again as soon as its
    Deployment row is deleted. This helper exists purely to leave an explicit
    audit trail in rollback scenarios.

    Args:
        db: Active SQLAlchemy session.
        port: The port to release.
    """
    logger.info("Released relay UDP port %d.", port)
def validate_port_available(db: Session, port: int) -> bool:
    """Check if a specific port is available both in DB and on the host.

    Args:
        db: Active SQLAlchemy session.
        port: Port number to check.

    Returns:
        True if the port is available.
    """
    if port in get_allocated_ports(db):
        return False
    return not _is_udp_port_in_use(port)