First Build alpha 0.1
0
app/services/__init__.py
Normal file
334
app/services/docker_service.py
Normal file
@@ -0,0 +1,334 @@
"""Docker container management via the Python Docker SDK.

Responsible for creating, starting, stopping, restarting, and removing
per-customer Docker Compose stacks. Also provides log retrieval and
container health/status information.
"""

import logging
import os
import subprocess
import time
from typing import Any

import docker
from docker.errors import DockerException, NotFound

logger = logging.getLogger(__name__)


def _get_client() -> docker.DockerClient:
    """Return a Docker client configured from the environment (typically the
    local Unix socket).

    Returns:
        docker.DockerClient instance.
    """
    return docker.from_env()


def compose_up(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose up -d`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name (e.g. ``netbird-kunde5``).

    Returns:
        True on success.

    Raises:
        FileNotFoundError: If no docker-compose.yml exists in the instance directory.
        RuntimeError: If ``docker compose up`` fails.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    if not os.path.isfile(compose_file):
        raise FileNotFoundError(f"docker-compose.yml not found at {compose_file}")

    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "up", "-d", "--remove-orphans",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

    if result.returncode != 0:
        logger.error("docker compose up failed: %s", result.stderr)
        raise RuntimeError(f"docker compose up failed: {result.stderr}")

    logger.info("docker compose up succeeded for %s", project_name)
    return True


def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
    """Run ``docker compose down`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.
        remove_volumes: Whether to also remove volumes.

    Returns:
        True. A non-zero exit code is logged but not raised, since ``down``
        is called best-effort during cleanup and rollback.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "down",
    ]
    if remove_volumes:
        cmd.append("-v")

    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

    if result.returncode != 0:
        logger.warning("docker compose down returned non-zero: %s", result.stderr)
    return True


def compose_stop(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose stop`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "stop",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return result.returncode == 0


def compose_start(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose start`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "start",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return result.returncode == 0


def compose_restart(instance_dir: str, project_name: str) -> bool:
    """Run ``docker compose restart`` for a customer instance.

    Args:
        instance_dir: Absolute path to the customer's instance directory.
        project_name: Docker Compose project name.

    Returns:
        True on success.
    """
    compose_file = os.path.join(instance_dir, "docker-compose.yml")
    cmd = [
        "docker", "compose",
        "-f", compose_file,
        "-p", project_name,
        "restart",
    ]
    logger.info("Running: %s", " ".join(cmd))
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    return result.returncode == 0
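

# The four wrappers above repeat the same build-command/run/check-exit-code
# pattern. A shared helper could collapse stop/start/restart into one-liners;
# a minimal sketch (a hypothetical refactor, not part of this module's API):
#
#   def _run_compose(instance_dir: str, project_name: str, *args: str) -> bool:
#       compose_file = os.path.join(instance_dir, "docker-compose.yml")
#       cmd = ["docker", "compose", "-f", compose_file, "-p", project_name, *args]
#       result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
#       return result.returncode == 0
#
#   # compose_stop would then be: return _run_compose(instance_dir, project_name, "stop")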


def get_container_status(container_prefix: str) -> list[dict[str, Any]]:
    """Get the status of all containers matching a name prefix.

    Args:
        container_prefix: Container name prefix (e.g. ``netbird-kunde5``).

    Returns:
        List of dicts with container name, status, and health info.
    """
    client = _get_client()
    results: list[dict[str, Any]] = []
    try:
        containers = client.containers.list(all=True, filters={"name": container_prefix})
        for c in containers:
            health = "N/A"
            if c.attrs.get("State", {}).get("Health"):
                health = c.attrs["State"]["Health"].get("Status", "N/A")
            results.append({
                "name": c.name,
                "status": c.status,
                "health": health,
                "image": c.image.tags[0] if c.image.tags else c.image.id[:12],
                "created": c.attrs.get("Created", ""),
            })
    except DockerException as exc:
        logger.error("Failed to get container status: %s", exc)
    return results


def get_container_logs(container_name: str, tail: int = 200) -> str:
    """Retrieve recent logs from a container.

    Args:
        container_name: Full container name.
        tail: Number of log lines to retrieve.

    Returns:
        Log text.
    """
    client = _get_client()
    try:
        container = client.containers.get(container_name)
        return container.logs(tail=tail, timestamps=True).decode("utf-8", errors="replace")
    except NotFound:
        return f"Container {container_name} not found."
    except DockerException as exc:
        return f"Error retrieving logs: {exc}"


def get_all_container_logs(container_prefix: str, tail: int = 100) -> dict[str, str]:
    """Get logs for all containers matching a prefix.

    Args:
        container_prefix: Container name prefix.
        tail: Lines per container.

    Returns:
        Dict mapping container name to log text.
    """
    client = _get_client()
    logs: dict[str, str] = {}
    try:
        containers = client.containers.list(all=True, filters={"name": container_prefix})
        for c in containers:
            try:
                logs[c.name] = c.logs(tail=tail, timestamps=True).decode(
                    "utf-8", errors="replace"
                )
            except DockerException:
                logs[c.name] = "Error retrieving logs."
    except DockerException as exc:
        logger.error("Failed to list containers: %s", exc)
    return logs


def wait_for_healthy(container_prefix: str, timeout: int = 60) -> bool:
    """Wait until all containers with the given prefix are running.

    Args:
        container_prefix: Container name prefix.
        timeout: Maximum seconds to wait.

    Returns:
        True if all containers started within the timeout.
    """
    client = _get_client()
    deadline = time.time() + timeout

    while time.time() < deadline:
        try:
            containers = client.containers.list(
                all=True, filters={"name": container_prefix}
            )
            if not containers:
                time.sleep(2)
                continue

            all_running = all(c.status == "running" for c in containers)
            if all_running:
                logger.info("All containers for %s are running.", container_prefix)
                return True
        except DockerException as exc:
            logger.warning("Health check error: %s", exc)

        time.sleep(3)

    logger.warning("Timeout waiting for %s containers to start.", container_prefix)
    return False


def get_docker_stats(container_prefix: str) -> list[dict[str, Any]]:
    """Retrieve resource usage stats for containers matching a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        List of dicts with CPU and memory stats.
    """
    client = _get_client()
    stats_list: list[dict[str, Any]] = []
    try:
        containers = client.containers.list(filters={"name": container_prefix})
        for c in containers:
            try:
                raw = c.stats(stream=False)
                # CPU percent is derived the same way ``docker stats`` does it:
                # container CPU delta over system CPU delta, scaled by CPU count.
                cpu_delta = (
                    raw.get("cpu_stats", {}).get("cpu_usage", {}).get("total_usage", 0)
                    - raw.get("precpu_stats", {}).get("cpu_usage", {}).get("total_usage", 0)
                )
                system_delta = (
                    raw.get("cpu_stats", {}).get("system_cpu_usage", 0)
                    - raw.get("precpu_stats", {}).get("system_cpu_usage", 0)
                )
                num_cpus = len(
                    raw.get("cpu_stats", {}).get("cpu_usage", {}).get("percpu_usage", [1])
                )
                cpu_pct = 0.0
                if system_delta > 0:
                    cpu_pct = (cpu_delta / system_delta) * num_cpus * 100

                mem_usage = raw.get("memory_stats", {}).get("usage", 0)
                mem_limit = raw.get("memory_stats", {}).get("limit", 1)

                stats_list.append({
                    "name": c.name,
                    "cpu_percent": round(cpu_pct, 2),
                    "memory_usage_mb": round(mem_usage / 1024 / 1024, 1),
                    "memory_limit_mb": round(mem_limit / 1024 / 1024, 1),
                    "memory_percent": round((mem_usage / mem_limit) * 100, 1) if mem_limit else 0,
                })
            except DockerException:
                stats_list.append({"name": c.name, "error": "Failed to get stats"})
    except DockerException as exc:
        logger.error("Failed to get docker stats: %s", exc)
    return stats_list


def remove_instance_containers(container_prefix: str) -> bool:
    """Force-remove all containers matching a prefix.

    Args:
        container_prefix: Container name prefix.

    Returns:
        True if removal succeeded.
    """
    client = _get_client()
    try:
        containers = client.containers.list(all=True, filters={"name": container_prefix})
        for c in containers:
            logger.info("Removing container %s", c.name)
            c.remove(force=True)
        return True
    except DockerException as exc:
        logger.error("Failed to remove containers: %s", exc)
        return False
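

# Example usage of this module (a sketch; it assumes an instance directory
# containing a valid docker-compose.yml already exists at the given path):
#
#   from app.services import docker_service
#
#   docker_service.compose_up("/opt/netbird/kunde5", "netbird-kunde5")
#   if docker_service.wait_for_healthy("netbird-kunde5", timeout=60):
#       for c in docker_service.get_container_status("netbird-kunde5"):
#           print(c["name"], c["status"], c["health"])
#   print(docker_service.get_container_logs("netbird-kunde5-management", tail=50))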
396
app/services/netbird_service.py
Normal file
@@ -0,0 +1,396 @@
"""NetBird deployment orchestration service.

Coordinates the full customer deployment lifecycle:
1. Validate inputs
2. Allocate ports
3. Generate configs from Jinja2 templates
4. Create instance directory and write files
5. Start Docker containers
6. Wait for health checks
7. Create NPM proxy hosts
8. Update database

Includes comprehensive rollback on failure.
"""

import logging
import os
import shutil
from datetime import datetime
from typing import Any

from jinja2 import Environment, FileSystemLoader
from sqlalchemy.orm import Session

from app.models import Customer, Deployment, DeploymentLog
from app.services import docker_service, npm_service, port_manager
from app.utils.config import get_system_config
from app.utils.security import encrypt_value, generate_relay_secret

logger = logging.getLogger(__name__)

# Path to the Jinja2 templates (three levels up from this file, i.e. the project root)
TEMPLATE_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "templates"
)


def _get_jinja_env() -> Environment:
    """Create a Jinja2 environment for template rendering."""
    return Environment(
        loader=FileSystemLoader(TEMPLATE_DIR),
        keep_trailing_newline=True,
    )


def _log_action(
    db: Session, customer_id: int, action: str, status: str, message: str, details: str = ""
) -> None:
    """Write a deployment log entry.

    Args:
        db: Active session.
        customer_id: The customer this log belongs to.
        action: Action name (e.g. ``deploy``, ``stop``).
        status: ``success``, ``error``, or ``info``.
        message: Human-readable message.
        details: Additional details (optional).
    """
    log = DeploymentLog(
        customer_id=customer_id,
        action=action,
        status=status,
        message=message,
        details=details,
    )
    db.add(log)
    db.commit()


async def deploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Execute the full deployment workflow for a customer.

    Args:
        db: Active session.
        customer_id: Customer to deploy.

    Returns:
        Dict with ``success`` and either ``setup_url`` or ``error``.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}

    config = get_system_config(db)
    if not config:
        return {"success": False, "error": "System not configured. Please set up system settings first."}

    # Update status to deploying
    customer.status = "deploying"
    db.commit()

    _log_action(db, customer_id, "deploy", "info", "Deployment started.")

    allocated_port = None
    instance_dir = None
    container_prefix = f"netbird-kunde{customer_id}"

    try:
        # Step 1: Allocate relay UDP port
        allocated_port = port_manager.allocate_port(db, config.relay_base_port)
        _log_action(db, customer_id, "deploy", "info", f"Allocated UDP port {allocated_port}.")

        # Step 2: Generate relay secret
        relay_secret = generate_relay_secret()

        # Step 3: Create instance directory
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
        os.makedirs(instance_dir, exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "management"), exist_ok=True)
        os.makedirs(os.path.join(instance_dir, "data", "signal"), exist_ok=True)
        _log_action(db, customer_id, "deploy", "info", f"Created directory {instance_dir}.")

        # Step 4: Render templates
        jinja_env = _get_jinja_env()
        template_vars = {
            "customer_id": customer_id,
            "subdomain": customer.subdomain,
            "base_domain": config.base_domain,
            "instance_dir": instance_dir,
            "relay_udp_port": allocated_port,
            "relay_secret": relay_secret,
            "netbird_management_image": config.netbird_management_image,
            "netbird_signal_image": config.netbird_signal_image,
            "netbird_relay_image": config.netbird_relay_image,
            "netbird_dashboard_image": config.netbird_dashboard_image,
            "docker_network": config.docker_network,
        }

        # docker-compose.yml
        dc_template = jinja_env.get_template("docker-compose.yml.j2")
        dc_content = dc_template.render(**template_vars)
        with open(os.path.join(instance_dir, "docker-compose.yml"), "w") as f:
            f.write(dc_content)

        # management.json
        mgmt_template = jinja_env.get_template("management.json.j2")
        mgmt_content = mgmt_template.render(**template_vars)
        with open(os.path.join(instance_dir, "management.json"), "w") as f:
            f.write(mgmt_content)

        # relay.env
        relay_template = jinja_env.get_template("relay.env.j2")
        relay_content = relay_template.render(**template_vars)
        with open(os.path.join(instance_dir, "relay.env"), "w") as f:
            f.write(relay_content)

        _log_action(db, customer_id, "deploy", "info", "Configuration files generated.")

        # Step 5: Start Docker containers
        docker_service.compose_up(instance_dir, container_prefix)
        _log_action(db, customer_id, "deploy", "info", "Docker containers started.")

        # Step 6: Wait for containers to be healthy
        healthy = docker_service.wait_for_healthy(container_prefix, timeout=60)
        if not healthy:
            _log_action(
                db, customer_id, "deploy", "error",
                "Containers did not become healthy within 60 seconds."
            )
            # Don't fail completely; the containers might still come up

        # Step 7: Create NPM proxy host
        domain = f"{customer.subdomain}.{config.base_domain}"
        dashboard_container = f"netbird-kunde{customer_id}-dashboard"
        npm_result = await npm_service.create_proxy_host(
            api_url=config.npm_api_url,
            api_token=config.npm_api_token,
            domain=domain,
            forward_host=dashboard_container,
            forward_port=80,
            admin_email=config.admin_email,
            subdomain=customer.subdomain,
            customer_id=customer_id,
        )

        npm_proxy_id = npm_result.get("proxy_id")
        if npm_result.get("error"):
            _log_action(
                db, customer_id, "deploy", "error",
                f"NPM proxy creation failed: {npm_result['error']}",
            )
            # Continue; the deployment works without NPM, and an admin can fix the proxy later

        # Step 8: Create deployment record
        setup_url = f"https://{domain}"
        deployment = Deployment(
            customer_id=customer_id,
            container_prefix=container_prefix,
            relay_udp_port=allocated_port,
            npm_proxy_id=npm_proxy_id,
            relay_secret=encrypt_value(relay_secret),
            setup_url=setup_url,
            deployment_status="running",
            deployed_at=datetime.utcnow(),
        )
        db.add(deployment)

        customer.status = "active"
        db.commit()

        _log_action(db, customer_id, "deploy", "success", f"Deployment complete. URL: {setup_url}")

        return {"success": True, "setup_url": setup_url}

    except Exception as exc:
        logger.exception("Deployment failed for customer %d", customer_id)

        # Rollback: stop and remove containers if they were started
        try:
            docker_service.compose_down(
                instance_dir or os.path.join(config.data_dir, f"kunde{customer_id}"),
                container_prefix,
                remove_volumes=True,
            )
        except Exception:
            pass

        # Rollback: remove instance directory
        if instance_dir and os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
            except Exception:
                pass

        customer.status = "error"
        db.commit()

        _log_action(
            db, customer_id, "deploy", "error",
            f"Deployment failed: {exc}",
            details=str(exc),
        )

        return {"success": False, "error": str(exc)}
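

# Example invocation (a sketch; ``SessionLocal`` stands in for the app's
# session factory, which is not part of this file):
#
#   import asyncio
#   from app.database import SessionLocal  # hypothetical import path
#
#   async def main() -> None:
#       db = SessionLocal()
#       try:
#           result = await deploy_customer(db, customer_id=5)
#           # On success: {"success": True, "setup_url": "https://<subdomain>.<base_domain>"}
#           print(result)
#       finally:
#           db.close()
#
#   asyncio.run(main())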


async def undeploy_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Remove all resources for a customer deployment.

    Args:
        db: Active session.
        customer_id: Customer to undeploy.

    Returns:
        Dict with ``success`` bool.
    """
    customer = db.query(Customer).filter(Customer.id == customer_id).first()
    if not customer:
        return {"success": False, "error": "Customer not found."}

    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)

    if deployment and config:
        instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")

        # Stop and remove containers
        try:
            docker_service.compose_down(instance_dir, deployment.container_prefix, remove_volumes=True)
            _log_action(db, customer_id, "undeploy", "info", "Containers removed.")
        except Exception as exc:
            _log_action(db, customer_id, "undeploy", "error", f"Container removal error: {exc}")

        # Remove NPM proxy host
        if deployment.npm_proxy_id and config.npm_api_token:
            try:
                await npm_service.delete_proxy_host(
                    config.npm_api_url, config.npm_api_token, deployment.npm_proxy_id
                )
                _log_action(db, customer_id, "undeploy", "info", "NPM proxy host removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"NPM removal error: {exc}")

        # Remove instance directory
        if os.path.isdir(instance_dir):
            try:
                shutil.rmtree(instance_dir)
                _log_action(db, customer_id, "undeploy", "info", "Instance directory removed.")
            except Exception as exc:
                _log_action(db, customer_id, "undeploy", "error", f"Directory removal error: {exc}")

        # Remove deployment record (this also frees the relay UDP port)
        db.delete(deployment)
        db.commit()

    _log_action(db, customer_id, "undeploy", "success", "Undeployment complete.")
    return {"success": True}


def stop_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Stop containers for a customer.

    Args:
        db: Active session.
        customer_id: Customer whose containers to stop.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if not deployment or not config:
        return {"success": False, "error": "Deployment or config not found."}

    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    ok = docker_service.compose_stop(instance_dir, deployment.container_prefix)
    if ok:
        deployment.deployment_status = "stopped"
        db.commit()
        _log_action(db, customer_id, "stop", "success", "Containers stopped.")
    else:
        _log_action(db, customer_id, "stop", "error", "Failed to stop containers.")
    return {"success": ok}


def start_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Start containers for a customer.

    Args:
        db: Active session.
        customer_id: Customer whose containers to start.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if not deployment or not config:
        return {"success": False, "error": "Deployment or config not found."}

    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    ok = docker_service.compose_start(instance_dir, deployment.container_prefix)
    if ok:
        deployment.deployment_status = "running"
        db.commit()
        _log_action(db, customer_id, "start", "success", "Containers started.")
    else:
        _log_action(db, customer_id, "start", "error", "Failed to start containers.")
    return {"success": ok}


def restart_customer(db: Session, customer_id: int) -> dict[str, Any]:
    """Restart containers for a customer.

    Args:
        db: Active session.
        customer_id: Customer whose containers to restart.

    Returns:
        Dict with ``success`` bool.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    config = get_system_config(db)
    if not deployment or not config:
        return {"success": False, "error": "Deployment or config not found."}

    instance_dir = os.path.join(config.data_dir, f"kunde{customer_id}")
    ok = docker_service.compose_restart(instance_dir, deployment.container_prefix)
    if ok:
        deployment.deployment_status = "running"
        db.commit()
        _log_action(db, customer_id, "restart", "success", "Containers restarted.")
    else:
        _log_action(db, customer_id, "restart", "error", "Failed to restart containers.")
    return {"success": ok}


def get_customer_health(db: Session, customer_id: int) -> dict[str, Any]:
    """Check the health of a customer's deployment.

    Args:
        db: Active session.
        customer_id: Customer ID.

    Returns:
        Dict with container statuses and overall health.
    """
    deployment = db.query(Deployment).filter(Deployment.customer_id == customer_id).first()
    if not deployment:
        return {"healthy": False, "error": "No deployment found.", "containers": []}

    containers = docker_service.get_container_status(deployment.container_prefix)
    all_running = all(c["status"] == "running" for c in containers) if containers else False

    # Update last health check time
    deployment.last_health_check = datetime.utcnow()
    if all_running:
        deployment.deployment_status = "running"
    elif containers:
        deployment.deployment_status = "failed"
    db.commit()

    return {
        "healthy": all_running,
        "containers": containers,
        "deployment_status": deployment.deployment_status,
        "last_check": deployment.last_health_check.isoformat(),
    }
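

# The health check is a one-shot call; a periodic monitor could wrap it in a
# loop (a sketch; this commit ships no scheduler):
#
#   import time
#
#   def poll_health(db, customer_id: int, interval: int = 60) -> None:
#       while True:
#           state = get_customer_health(db, customer_id)
#           if not state.get("healthy"):
#               logger.warning("Customer %d unhealthy: %s", customer_id, state)
#           time.sleep(interval)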
234
app/services/npm_service.py
Normal file
@@ -0,0 +1,234 @@
"""Nginx Proxy Manager API integration.

Creates, updates, and deletes proxy host entries so each customer's NetBird
dashboard is accessible at ``{subdomain}.{base_domain}`` with automatic
Let's Encrypt SSL certificates.
"""

import logging
from typing import Any

import httpx

logger = logging.getLogger(__name__)

# Timeout for NPM API calls (seconds)
NPM_TIMEOUT = 30


async def test_npm_connection(api_url: str, api_token: str) -> dict[str, Any]:
    """Test connectivity to the Nginx Proxy Manager API.

    Args:
        api_url: NPM API base URL (e.g. ``http://npm:81/api``).
        api_token: Bearer token for authentication.

    Returns:
        Dict with ``ok`` (bool) and ``message`` (str).
    """
    headers = {"Authorization": f"Bearer {api_token}"}
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.get(f"{api_url}/nginx/proxy-hosts", headers=headers)
            if resp.status_code == 200:
                count = len(resp.json())
                return {"ok": True, "message": f"Connected. {count} proxy hosts found."}
            return {
                "ok": False,
                "message": f"NPM returned status {resp.status_code}: {resp.text[:200]}",
            }
    except httpx.ConnectError:
        return {"ok": False, "message": "Connection refused. Is NPM running?"}
    except httpx.TimeoutException:
        return {"ok": False, "message": "Connection timed out."}
    except Exception as exc:
        return {"ok": False, "message": f"Unexpected error: {exc}"}


async def create_proxy_host(
    api_url: str,
    api_token: str,
    domain: str,
    forward_host: str,
    forward_port: int = 80,
    admin_email: str = "",
    subdomain: str = "",
    customer_id: int = 0,
) -> dict[str, Any]:
    """Create a proxy host entry in NPM with SSL for a customer.

    The proxy routes traffic as follows:
    - ``/`` -> dashboard container (port 80)
    - ``/api`` -> management container (port 80)
    - ``/signalexchange.*`` -> signal container (port 80, gRPC)
    - ``/relay`` -> relay container (port 80, WebSocket)

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        domain: Full domain (e.g. ``kunde1.example.com``).
        forward_host: Container name for the dashboard.
        forward_port: Port to forward to (default 80).
        admin_email: Email for Let's Encrypt.
        subdomain: Customer subdomain for building container names.
        customer_id: Customer ID for building container names.

    Returns:
        Dict with ``proxy_id`` on success or ``error`` on failure.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    # Build advanced Nginx config to route sub-paths to different containers
    mgmt_container = f"netbird-kunde{customer_id}-management"
    signal_container = f"netbird-kunde{customer_id}-signal"
    relay_container = f"netbird-kunde{customer_id}-relay"

    advanced_config = f"""
# NetBird Management API
location /api {{
    proxy_pass http://{mgmt_container}:80;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}}

# NetBird Signal (gRPC-Web)
location /signalexchange. {{
    grpc_pass grpc://{signal_container}:80;
    grpc_set_header Host $host;
}}

# NetBird Relay (WebSocket)
location /relay {{
    proxy_pass http://{relay_container}:80;
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
}}
"""

    payload = {
        "domain_names": [domain],
        "forward_scheme": "http",
        "forward_host": forward_host,
        "forward_port": forward_port,
        "certificate_id": 0,
        "ssl_forced": True,
        "hsts_enabled": True,
        "hsts_subdomains": False,
        "http2_support": True,
        "block_exploits": True,
        "allow_websocket_upgrade": True,
        "access_list_id": 0,
        "advanced_config": advanced_config.strip(),
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }

    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.post(
                f"{api_url}/nginx/proxy-hosts", json=payload, headers=headers
            )
            if resp.status_code in (200, 201):
                data = resp.json()
                proxy_id = data.get("id")
                logger.info("Created NPM proxy host %s (id=%s)", domain, proxy_id)

                # Request SSL certificate
                await _request_ssl(client, api_url, headers, proxy_id, domain, admin_email)

                return {"proxy_id": proxy_id}
            else:
                error_msg = f"NPM returned {resp.status_code}: {resp.text[:300]}"
                logger.error("Failed to create proxy host: %s", error_msg)
                return {"error": error_msg}
    except Exception as exc:
        logger.error("NPM API error: %s", exc)
        return {"error": str(exc)}


async def _request_ssl(
    client: httpx.AsyncClient,
    api_url: str,
    headers: dict[str, str],
    proxy_id: int,
    domain: str,
    admin_email: str,
) -> None:
    """Request a Let's Encrypt SSL certificate for a proxy host.

    Args:
        client: Open httpx client.
        api_url: NPM API base URL.
        headers: Auth headers.
        proxy_id: The proxy host ID.
        domain: The domain to certify.
        admin_email: Contact email for Let's Encrypt.
    """
    ssl_payload = {
        "domain_names": [domain],
        "meta": {
            "letsencrypt_agree": True,
            "letsencrypt_email": admin_email,
            "dns_challenge": False,
        },
    }
    try:
        resp = await client.post(
            f"{api_url}/nginx/certificates", json=ssl_payload, headers=headers
        )
        if resp.status_code in (200, 201):
            cert_id = resp.json().get("id")
            # Assign the certificate to the proxy host
            await client.put(
                f"{api_url}/nginx/proxy-hosts/{proxy_id}",
                json={"certificate_id": cert_id},
                headers=headers,
            )
            logger.info("SSL certificate assigned to proxy host %s", proxy_id)
        else:
            logger.warning("SSL request returned %s: %s", resp.status_code, resp.text[:200])
    except Exception as exc:
        logger.warning("SSL certificate request failed: %s", exc)


async def delete_proxy_host(api_url: str, api_token: str, proxy_id: int) -> bool:
    """Delete a proxy host from NPM.

    Args:
        api_url: NPM API base URL.
        api_token: Bearer token.
        proxy_id: The proxy host ID to delete.

    Returns:
        True on success.
    """
    headers = {"Authorization": f"Bearer {api_token}"}
    try:
        async with httpx.AsyncClient(timeout=NPM_TIMEOUT) as client:
            resp = await client.delete(
                f"{api_url}/nginx/proxy-hosts/{proxy_id}", headers=headers
            )
            if resp.status_code in (200, 204):
                logger.info("Deleted NPM proxy host %d", proxy_id)
                return True
            logger.warning(
                "Failed to delete proxy host %d: %s %s",
                proxy_id, resp.status_code, resp.text[:200],
            )
            return False
    except Exception as exc:
        logger.error("NPM delete error: %s", exc)
        return False
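

# Example usage (a sketch; the URL and token are placeholders for values
# stored in the system configuration):
#
#   import asyncio
#
#   async def main() -> None:
#       status = await test_npm_connection("http://npm:81/api", "TOKEN")
#       print(status["message"])
#       if status["ok"]:
#           created = await create_proxy_host(
#               "http://npm:81/api", "TOKEN",
#               domain="kunde1.example.com",
#               forward_host="netbird-kunde1-dashboard",
#               admin_email="admin@example.com",
#               subdomain="kunde1",
#               customer_id=1,
#           )
#           print(created)  # {"proxy_id": 42} or {"error": "..."}
#
#   asyncio.run(main())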
110
app/services/port_manager.py
Normal file
@@ -0,0 +1,110 @@
"""UDP port allocation service for NetBird relay/STUN ports.

Manages the range starting at ``relay_base_port`` (default 3478). Each
customer gets one unique UDP port. The manager checks both the database and
the OS to avoid collisions.
"""

import logging
import socket

from sqlalchemy.orm import Session

from app.models import Deployment

logger = logging.getLogger(__name__)


def _is_udp_port_in_use(port: int) -> bool:
    """Check whether a UDP port is currently bound on the host.

    Works by attempting to bind the port: if the bind fails with ``OSError``,
    something else already holds it.

    Args:
        port: UDP port number to probe.

    Returns:
        True if the port is in use.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        sock.bind(("0.0.0.0", port))
        return False
    except OSError:
        return True
    finally:
        sock.close()


def get_allocated_ports(db: Session) -> set[int]:
    """Return the set of relay UDP ports already assigned in the database.

    Args:
        db: Active SQLAlchemy session.

    Returns:
        Set of port numbers.
    """
    rows = db.query(Deployment.relay_udp_port).all()
    return {r[0] for r in rows}


def allocate_port(db: Session, base_port: int = 3478, max_ports: int = 100) -> int:
    """Find and return the next available relay UDP port.

    Scans from *base_port* to *base_port + max_ports - 1*, skipping ports
    that are either already in the database or currently bound on the host.

    Args:
        db: Active SQLAlchemy session.
        base_port: Start of the port range.
        max_ports: Number of ports in the range.

    Returns:
        An available port number.

    Raises:
        RuntimeError: If no port in the range is available.
    """
    allocated = get_allocated_ports(db)
    for port in range(base_port, base_port + max_ports):
        if port in allocated:
            continue
        if _is_udp_port_in_use(port):
            logger.warning("Port %d is in use on the host, skipping.", port)
            continue
        logger.info("Allocated relay UDP port %d.", port)
        return port

    raise RuntimeError(
        f"No available relay ports in range {base_port}-{base_port + max_ports - 1}. "
        f"All {max_ports} ports are allocated or in use."
    )


def release_port(db: Session, port: int) -> None:
    """Mark a port as released (informational logging only).

    The actual release happens when the ``Deployment`` row is deleted. This
    helper exists for explicit logging in rollback scenarios.

    Args:
        db: Active SQLAlchemy session.
        port: The port to release.
    """
    logger.info("Released relay UDP port %d.", port)


def validate_port_available(db: Session, port: int) -> bool:
    """Check whether a specific port is available both in the DB and on the host.

    Args:
        db: Active SQLAlchemy session.
        port: Port number to check.

    Returns:
        True if the port is available.
    """
    allocated = get_allocated_ports(db)
    if port in allocated:
        return False
    return not _is_udp_port_in_use(port)
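

# Example usage (a sketch; ``SessionLocal`` stands in for the app's session
# factory):
#
#   db = SessionLocal()
#   port = allocate_port(db, base_port=3478, max_ports=100)
#   # e.g. returns 3480 if 3478 and 3479 are already taken or bound
#   assert validate_port_available(db, port)  # still free until a Deployment row is committed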