修复docker下目录权限问题

This commit is contained in:
Titan 2026-03-09 10:54:40 +08:00
parent 8f356cdf51
commit 28bb208469
3 changed files with 167 additions and 13 deletions

View File

@ -16,6 +16,7 @@ import atexit
import hashlib
import logging
import os
import re
import signal
import threading
import time
@ -197,28 +198,53 @@ class AioSandboxProvider(SandboxProvider):
return mounts
@staticmethod
def _get_thread_mounts(thread_id: str) -> list[tuple[str, str, bool]]:
"""Get volume mounts for a thread's data directories.
@classmethod
def _ensure_thread_mount_dirs(cls, thread_id: str) -> list[tuple[str, str, bool]]:
"""Ensure thread data mount directories exist and are writable."""
base_dir = Path(os.getcwd())
thread_dir = base_dir / THREAD_DATA_BASE_DIR / thread_id / "user-data"
host_thread_dir = cls._resolve_host_bind_path(thread_dir)
Creates directories if they don't exist (lazy initialization).
"""
base_dir = os.getcwd()
thread_dir = Path(base_dir) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
if str(host_thread_dir) != str(thread_dir):
logger.info(
"Resolved thread mount source from %s to host path %s",
thread_dir,
host_thread_dir,
)
# Ensure the root user-data directory exists and is writable for
# sandbox runtimes that run as non-root users.
os.makedirs(host_thread_dir, exist_ok=True)
try:
os.chmod(host_thread_dir, 0o777)
except OSError as e:
logger.warning(f"Could not chmod thread user-data dir {host_thread_dir}: {e}")
mounts = [
(str(thread_dir / "workspace"), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
(str(thread_dir / "uploads"), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
(str(thread_dir / "outputs"), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
(str(host_thread_dir / "workspace"), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
(str(host_thread_dir / "uploads"), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
(str(host_thread_dir / "outputs"), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
]
for host_path, _, _ in mounts:
os.makedirs(host_path, exist_ok=True)
try:
os.chmod(host_path, 0o777)
except OSError as e:
logger.warning(f"Could not chmod thread mount dir {host_path}: {e}")
return mounts
@staticmethod
def _get_skills_mount() -> tuple[str, str, bool] | None:
@classmethod
def _get_thread_mounts(cls, thread_id: str) -> list[tuple[str, str, bool]]:
    """Get volume mounts for a thread's data directories.

    Thin wrapper: delegates to ``_ensure_thread_mount_dirs`` and returns its
    result unchanged, so the mount directories are created on first use
    (lazy initialization).

    Args:
        thread_id: Identifier of the thread whose data directories are mounted.

    Returns:
        The list of 3-tuples produced by ``_ensure_thread_mount_dirs``
        (presumably ``(host_path, container_path, read_only)`` - confirm
        against the code that consumes the mounts).
    """
    return cls._ensure_thread_mount_dirs(thread_id)
@classmethod
def _get_skills_mount(cls) -> tuple[str, str, bool] | None:
"""Get the skills directory mount configuration."""
try:
config = get_app_config()
@ -226,11 +252,73 @@ class AioSandboxProvider(SandboxProvider):
container_path = config.skills.container_path
if skills_path.exists():
return (str(skills_path), container_path, True) # Read-only for security
host_skills_path = cls._resolve_host_bind_path(skills_path)
if str(host_skills_path) != str(skills_path):
logger.info(
"Resolved skills bind source from %s to host path %s",
skills_path,
host_skills_path,
)
return (str(host_skills_path), container_path, True) # Read-only for security
except Exception as e:
logger.warning(f"Could not setup skills mount: {e}")
return None
@staticmethod
def _decode_mountinfo_path(path: str) -> str:
"""Decode escaped mountinfo paths (e.g. ``\040`` -> space)."""
return re.sub(r"\\([0-7]{3})", lambda m: chr(int(m.group(1), 8)), path)
@classmethod
def _resolve_host_bind_path(cls, path: Path) -> Path:
    """Resolve a container-visible bind path to its host source path.

    This is needed when running gateway/langgraph inside Docker while using
    the host Docker socket to start sandbox containers. In that scenario,
    bind sources passed to Docker must be host paths, not paths inside the
    current container.

    The mapping is derived from ``/proc/self/mountinfo``: the entry whose
    mount point is the longest prefix of ``path`` is selected, and the part
    of ``path`` below that mount point is re-rooted onto the entry's ``root``
    field.

    Args:
        path: Path as seen by the current process.

    Returns:
        The re-rooted path, or the original ``path`` unchanged when
        mountinfo cannot be read or no entry matches.
    """
    # Resolution is best-effort, so any failure here falls back to the
    # path as given instead of propagating.
    try:
        target = str(path.resolve())
    except Exception:
        target = str(path)

    try:
        with open("/proc/self/mountinfo") as f:
            lines = f.readlines()
    except Exception:
        # No procfs (non-Linux) or unreadable file: nothing to translate.
        return path

    best_mount_point: str | None = None
    best_root: str | None = None

    for line in lines:
        # Only the part before " - " has the fixed leading fields.
        pre, _, _ = line.partition(" - ")
        fields = pre.split()
        if len(fields) < 5:
            continue

        # Fields: mount_id parent_id major:minor root mount_point ...
        root = cls._decode_mountinfo_path(fields[3])
        mount_point = cls._decode_mountinfo_path(fields[4])

        # The trailing "/" keeps "/data" from matching "/database";
        # rstrip handles the root mount point "/".
        is_within = target == mount_point or target.startswith(f"{mount_point.rstrip('/')}/")
        if not is_within:
            continue

        # Longest mount point wins. ">=" (not ">") so that, when several
        # entries share the same mount point (stacked mounts), the entry
        # listed last is used - that is the mount currently visible at that
        # location, whereas earlier entries are shadowed by it.
        if best_mount_point is None or len(mount_point) >= len(best_mount_point):
            best_mount_point = mount_point
            best_root = root

    if best_mount_point is None or best_root is None:
        return path

    # NOTE(review): the mountinfo "root" field is a path inside the mounted
    # filesystem. It equals the absolute host path only when that filesystem
    # is mounted at "/" on the host - confirm for hosts with separate
    # partitions or Docker Desktop style VMs.
    rel = target[len(best_mount_point) :].lstrip("/")
    if rel:
        return Path(best_root) / rel
    return Path(best_root)
# ── Idle timeout management ──────────────────────────────────────────
def _start_idle_checker(self) -> None:
@ -331,6 +419,11 @@ class AioSandboxProvider(SandboxProvider):
Layer 2: Cross-process state store + file lock (covers multi-process)
Layer 3: Backend discovery (covers containers started by other processes)
"""
if thread_id:
# Best-effort self-heal for existing threads/sandboxes: make sure
# mounted directories are writable by non-root users inside sandbox.
self._ensure_thread_mount_dirs(thread_id)
# ── Layer 1: In-process cache (fast path) ──
if thread_id:
with self._lock:

View File

@ -8,6 +8,7 @@ from __future__ import annotations
import logging
import subprocess
import time
from src.utils.network import get_free_port, release_port
@ -107,6 +108,7 @@ class LocalContainerBackend(SandboxBackend):
port = get_free_port(start_port=self._base_port)
try:
container_id = self._start_container(container_name, port, extra_mounts)
self._ensure_user_data_permissions(container_name)
except Exception:
release_port(port)
raise
@ -121,6 +123,40 @@ class LocalContainerBackend(SandboxBackend):
container_id=container_id,
)
def _ensure_user_data_permissions(self, container_name: str) -> None:
    """Ensure /mnt/user-data subdirectories are writable in sandbox container.

    Some sandbox services run as non-root users (e.g. ``gem``). If mounted
    host directories are created as ``755 root:root``, uploads may fail with
    permission denied. This best-effort fix normalizes permissions by running
    ``mkdir -p`` and ``chmod 777`` on the three subdirectories through
    ``<runtime> exec``.

    The command is attempted up to five times. Failures are logged and never
    raised, so a permission problem cannot abort sandbox creation after the
    container has already been started.

    Args:
        container_name: Name of the running container to fix up.
    """
    fix_cmd = (
        "mkdir -p /mnt/user-data/uploads /mnt/user-data/workspace /mnt/user-data/outputs "
        "&& chmod 777 /mnt/user-data/uploads /mnt/user-data/workspace /mnt/user-data/outputs"
    )
    max_attempts = 5
    retry_delay_seconds = 0.3

    # Retry briefly because the init process may still be setting up paths
    # right after container startup.
    for attempt in range(1, max_attempts + 1):
        try:
            subprocess.run(
                [self._runtime, "exec", container_name, "sh", "-lc", fix_cmd],
                capture_output=True,
                text=True,
                check=True,
                timeout=5,
            )
        except (subprocess.SubprocessError, OSError) as e:
            # SubprocessError covers CalledProcessError and TimeoutExpired;
            # OSError covers failure to launch the runtime binary at all.
            logger.debug("Retrying user-data permission fix for %s: %s", container_name, e)
            # Do not delay the caller once no further attempt will follow.
            if attempt < max_attempts:
                time.sleep(retry_delay_seconds)
        else:
            return

    logger.warning(
        "Failed to ensure user-data permissions for %s; uploads may fail until permissions are fixed",
        container_name,
    )
def destroy(self, info: SandboxInfo) -> None:
"""Stop the container and release its port."""
if info.container_id:

View File

@ -225,6 +225,31 @@ def _build_pod(sandbox_id: str, thread_id: str) -> k8s_client.V1Pod:
},
),
spec=k8s_client.V1PodSpec(
init_containers=[
k8s_client.V1Container(
name="init-user-data-permissions",
image=SANDBOX_IMAGE,
image_pull_policy="IfNotPresent",
command=[
"/bin/sh",
"-c",
"mkdir -p /mnt/user-data/workspace /mnt/user-data/uploads /mnt/user-data/outputs && chmod -R 0777 /mnt/user-data",
],
volume_mounts=[
k8s_client.V1VolumeMount(
name="user-data",
mount_path="/mnt/user-data",
read_only=False,
),
],
security_context=k8s_client.V1SecurityContext(
run_as_user=0,
run_as_group=0,
privileged=False,
allow_privilege_escalation=False,
),
)
],
containers=[
k8s_client.V1Container(
name="sandbox",