Files
NetBirdMSP-Appliance/app/services/docker_service.py
2026-02-07 12:18:20 +01:00

335 lines
11 KiB
Python

"""Docker container management via the Python Docker SDK.
Responsible for creating, starting, stopping, restarting, and removing
per-customer Docker Compose stacks. Also provides log retrieval and
container health/status information.
"""
import logging
import os
import subprocess
import time
from typing import Any, Optional
import docker
from docker.errors import DockerException, NotFound
logger = logging.getLogger(__name__)
def _get_client() -> docker.DockerClient:
"""Return a Docker client connected via the Unix socket.
Returns:
docker.DockerClient instance.
"""
return docker.from_env()
def compose_up(instance_dir: str, project_name: str) -> bool:
"""Run ``docker compose up -d`` for a customer instance.
Args:
instance_dir: Absolute path to the customer's instance directory.
project_name: Docker Compose project name (e.g. ``netbird-kunde5``).
Returns:
True on success.
Raises:
RuntimeError: If ``docker compose up`` fails.
"""
compose_file = os.path.join(instance_dir, "docker-compose.yml")
if not os.path.isfile(compose_file):
raise FileNotFoundError(f"docker-compose.yml not found at {compose_file}")
cmd = [
"docker", "compose",
"-f", compose_file,
"-p", project_name,
"up", "-d", "--remove-orphans",
]
logger.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
logger.error("docker compose up failed: %s", result.stderr)
raise RuntimeError(f"docker compose up failed: {result.stderr}")
logger.info("docker compose up succeeded for %s", project_name)
return True
def compose_down(instance_dir: str, project_name: str, remove_volumes: bool = False) -> bool:
"""Run ``docker compose down`` for a customer instance.
Args:
instance_dir: Absolute path to the customer's instance directory.
project_name: Docker Compose project name.
remove_volumes: Whether to also remove volumes.
Returns:
True on success.
"""
compose_file = os.path.join(instance_dir, "docker-compose.yml")
cmd = [
"docker", "compose",
"-f", compose_file,
"-p", project_name,
"down",
]
if remove_volumes:
cmd.append("-v")
logger.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
logger.warning("docker compose down returned non-zero: %s", result.stderr)
return True
def compose_stop(instance_dir: str, project_name: str) -> bool:
"""Run ``docker compose stop`` for a customer instance.
Args:
instance_dir: Absolute path to the customer's instance directory.
project_name: Docker Compose project name.
Returns:
True on success.
"""
compose_file = os.path.join(instance_dir, "docker-compose.yml")
cmd = [
"docker", "compose",
"-f", compose_file,
"-p", project_name,
"stop",
]
logger.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
return result.returncode == 0
def compose_start(instance_dir: str, project_name: str) -> bool:
"""Run ``docker compose start`` for a customer instance.
Args:
instance_dir: Absolute path to the customer's instance directory.
project_name: Docker Compose project name.
Returns:
True on success.
"""
compose_file = os.path.join(instance_dir, "docker-compose.yml")
cmd = [
"docker", "compose",
"-f", compose_file,
"-p", project_name,
"start",
]
logger.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
return result.returncode == 0
def compose_restart(instance_dir: str, project_name: str) -> bool:
"""Run ``docker compose restart`` for a customer instance.
Args:
instance_dir: Absolute path to the customer's instance directory.
project_name: Docker Compose project name.
Returns:
True on success.
"""
compose_file = os.path.join(instance_dir, "docker-compose.yml")
cmd = [
"docker", "compose",
"-f", compose_file,
"-p", project_name,
"restart",
]
logger.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
return result.returncode == 0
def get_container_status(container_prefix: str) -> list[dict[str, Any]]:
"""Get the status of all containers matching a prefix.
Args:
container_prefix: Container name prefix (e.g. ``netbird-kunde5``).
Returns:
List of dicts with container name, status, and health info.
"""
client = _get_client()
results: list[dict[str, Any]] = []
try:
containers = client.containers.list(all=True, filters={"name": container_prefix})
for c in containers:
health = "N/A"
if c.attrs.get("State", {}).get("Health"):
health = c.attrs["State"]["Health"].get("Status", "N/A")
results.append({
"name": c.name,
"status": c.status,
"health": health,
"image": str(c.image.tags[0]) if c.image.tags else str(c.image.id[:12]),
"created": c.attrs.get("Created", ""),
})
except DockerException as exc:
logger.error("Failed to get container status: %s", exc)
return results
def get_container_logs(container_name: str, tail: int = 200) -> str:
"""Retrieve recent logs from a container.
Args:
container_name: Full container name.
tail: Number of log lines to retrieve.
Returns:
Log text.
"""
client = _get_client()
try:
container = client.containers.get(container_name)
return container.logs(tail=tail, timestamps=True).decode("utf-8", errors="replace")
except NotFound:
return f"Container {container_name} not found."
except DockerException as exc:
return f"Error retrieving logs: {exc}"
def get_all_container_logs(container_prefix: str, tail: int = 100) -> dict[str, str]:
"""Get logs for all containers matching a prefix.
Args:
container_prefix: Container name prefix.
tail: Lines per container.
Returns:
Dict mapping container name to log text.
"""
client = _get_client()
logs: dict[str, str] = {}
try:
containers = client.containers.list(all=True, filters={"name": container_prefix})
for c in containers:
try:
logs[c.name] = c.logs(tail=tail, timestamps=True).decode(
"utf-8", errors="replace"
)
except DockerException:
logs[c.name] = "Error retrieving logs."
except DockerException as exc:
logger.error("Failed to list containers: %s", exc)
return logs
def wait_for_healthy(container_prefix: str, timeout: int = 60) -> bool:
"""Wait until all containers with the given prefix are running.
Args:
container_prefix: Container name prefix.
timeout: Maximum seconds to wait.
Returns:
True if all containers started within timeout.
"""
client = _get_client()
deadline = time.time() + timeout
while time.time() < deadline:
try:
containers = client.containers.list(
all=True, filters={"name": container_prefix}
)
if not containers:
time.sleep(2)
continue
all_running = all(c.status == "running" for c in containers)
if all_running:
logger.info("All containers for %s are running.", container_prefix)
return True
except DockerException as exc:
logger.warning("Health check error: %s", exc)
time.sleep(3)
logger.warning("Timeout waiting for %s containers to start.", container_prefix)
return False
def get_docker_stats(container_prefix: str) -> list[dict[str, Any]]:
"""Retrieve resource usage stats for containers matching a prefix.
Args:
container_prefix: Container name prefix.
Returns:
List of dicts with CPU, memory, and network stats.
"""
client = _get_client()
stats_list: list[dict[str, Any]] = []
try:
containers = client.containers.list(filters={"name": container_prefix})
for c in containers:
try:
raw = c.stats(stream=False)
cpu_delta = (
raw.get("cpu_stats", {}).get("cpu_usage", {}).get("total_usage", 0)
- raw.get("precpu_stats", {}).get("cpu_usage", {}).get("total_usage", 0)
)
system_delta = (
raw.get("cpu_stats", {}).get("system_cpu_usage", 0)
- raw.get("precpu_stats", {}).get("system_cpu_usage", 0)
)
num_cpus = len(
raw.get("cpu_stats", {}).get("cpu_usage", {}).get("percpu_usage", [1])
)
cpu_pct = 0.0
if system_delta > 0:
cpu_pct = (cpu_delta / system_delta) * num_cpus * 100
mem_usage = raw.get("memory_stats", {}).get("usage", 0)
mem_limit = raw.get("memory_stats", {}).get("limit", 1)
stats_list.append({
"name": c.name,
"cpu_percent": round(cpu_pct, 2),
"memory_usage_mb": round(mem_usage / 1024 / 1024, 1),
"memory_limit_mb": round(mem_limit / 1024 / 1024, 1),
"memory_percent": round((mem_usage / mem_limit) * 100, 1) if mem_limit else 0,
})
except DockerException:
stats_list.append({"name": c.name, "error": "Failed to get stats"})
except DockerException as exc:
logger.error("Failed to get docker stats: %s", exc)
return stats_list
def remove_instance_containers(container_prefix: str) -> bool:
"""Force-remove all containers matching a prefix.
Args:
container_prefix: Container name prefix.
Returns:
True if removal succeeded.
"""
client = _get_client()
try:
containers = client.containers.list(all=True, filters={"name": container_prefix})
for c in containers:
logger.info("Removing container %s", c.name)
c.remove(force=True)
return True
except DockerException as exc:
logger.error("Failed to remove containers: %s", exc)
return False