Hello there!
I’m trying to get CUDA working inside a nixos-container. It works when I start the CUDA application manually inside a nixos-container root-login session, but it fails when it is run automatically as a systemd service.
I’ve created a minimal container config to reproduce this behavior:
Host (on nixos-unstable):
hardware.graphics = {
enable = true;
extraPackages = with pkgs; [
nvidia-vaapi-driver
];
};
hardware.nvidia.open = true;
services.xserver.videoDrivers = [ "nvidia" ];
Container flake:
{
inputs = {
host.url = "path:/etc/nixos";
nixpkgs.follows = "host/nixpkgs";
# nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
};
nixConfig = {
extra-substituters = [
"https://nix-community.cachix.org"
"https://cuda-maintainers.cachix.org"
];
extra-trusted-public-keys = [
"nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
];
};
outputs = inputs: {
nixosConfigurations.container = inputs.nixpkgs.lib.nixosSystem {
specialArgs = {
inherit inputs;
};
modules = [
(
{ pkgs, ... }@args:
let
python = (
pkgs.python3.withPackages (
python-pkgs: with python-pkgs; [
torch
numpy
]
)
);
in
{
boot.isContainer = true;
nixpkgs.hostPlatform = "x86_64-linux";
system.stateVersion = args.config.system.nixos.release; # Don't actually do this for production of course
nixpkgs.config.allowUnfree = true;
nixpkgs.config.cudaSupport = true;
hardware.graphics = {
enable = true;
extraPackages = [
pkgs.nvidia-vaapi-driver
];
};
hardware.nvidia.open = true;
services.xserver.videoDrivers = [ "nvidia" ];
environment.systemPackages = [ python ];
systemd.services.cuda-test = {
description = "Cuda Test";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
Type = "exec";
User = "root";
Group = "root";
};
path = [ python ];
script = ''
cd /root
mkdir -p cudatest
cat << EOF > cudatest/main.py
from torch import cuda
print(cuda.is_available())
print(cuda.device_count())
print(cuda.get_device_name(cuda.current_device()))
EOF
cat << EOF > cudatest/run.sh
export CUDA_PATH=${pkgs.cudatoolkit};
export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib;${pkgs.linuxPackages.nvidiaPackages.stable}/lib";
python cudatest/main.py
EOF
chmod +x cudatest/run.sh
./cudatest/run.sh
'';
};
}
)
];
};
};
}
Run the following commands
sudo nixos-container create cudatest --flake ./container/flake/folder
echo -n "" | sudo tee /etc/nixos-containers/cudatest.conf   # disable private networking
sudo nixos-container start cudatest
sudo nixos-container root-login cudatest
journalctl -u cuda-test.service   # then CTRL+C
./cudatest/run.sh
Logs of the systemd service:
Aug 06 15:43:37 nixos systemd[1]: Started Cuda Test.
Aug 06 15:43:38 nixos cuda-test-start[285]: /nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py:789: UserWarning: Can't initialize NVML
Aug 06 15:43:38 nixos cuda-test-start[285]: warnings.warn("Can't initialize NVML")
Aug 06 15:43:38 nixos cuda-test-start[285]: False
Aug 06 15:43:38 nixos cuda-test-start[285]: 0
Aug 06 15:43:38 nixos cuda-test-start[285]: Traceback (most recent call last):
Aug 06 15:43:38 nixos cuda-test-start[285]: File "/root/cudatest/main.py", line 4, in <module>
Aug 06 15:43:38 nixos cuda-test-start[285]: print(cuda.get_device_name(cuda.current_device()))
Aug 06 15:43:38 nixos cuda-test-start[285]: ~~~~~~~~~~~~~~~~~~~^^
Aug 06 15:43:38 nixos cuda-test-start[285]: File "/nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py", line 1026, in current_device
Aug 06 15:43:38 nixos cuda-test-start[285]: _lazy_init()
Aug 06 15:43:38 nixos cuda-test-start[285]: ~~~~~~~~~~^^
Aug 06 15:43:38 nixos cuda-test-start[285]: File "/nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py", line 372, in _lazy_init
Aug 06 15:43:38 nixos cuda-test-start[285]: torch._C._cuda_init()
Aug 06 15:43:38 nixos cuda-test-start[285]: ~~~~~~~~~~~~~~~~~~~^^
Aug 06 15:43:38 nixos cuda-test-start[285]: RuntimeError: No CUDA GPUs are available
Aug 06 15:43:38 nixos systemd[1]: cuda-test.service: Main process exited, code=exited, status=1/FAILURE
Logs of manual start:
True
1
NVIDIA GeForce RTX 3060
My guess is that it’s a permission error. I’ve attempted to reproduce the Nvidia GPU passthrough steps from the systemd-nspawn article on the ArchWiki in a declarative container config (placed in the host config, since it exposes more configuration options):
containers.declarative-cudatest = {
allowedDevices = [
{
node = "/dev/dri";
modifier = "rw";
}
{
node = "/dev/shm";
modifier = "rw";
}
{
node = "/dev/nvidia0";
modifier = "rw";
}
{
node = "/dev/nvidiactl";
modifier = "rw";
}
{
node = "/dev/nvidia-modeset";
modifier = "rw";
}
];
extraFlags = [
"--bind=/dev/dri"
"--bind=/dev/shm"
"--bind=/dev/nvidia0"
"--bind=/dev/nvidiactl"
"--bind=/dev/nvidia-modeset"
];
config =
{ pkgs, ... }@args:
let
python = (
pkgs.python3.withPackages (
python-pkgs: with python-pkgs; [
torch
numpy
]
)
);
in
{
boot.isContainer = true;
system.stateVersion = args.config.system.nixos.release; # Don't actually do this for production of course
nixpkgs.config.allowUnfree = true;
nixpkgs.config.cudaSupport = true;
hardware.graphics = {
enable = true;
extraPackages = [
pkgs.nvidia-vaapi-driver
];
};
hardware.nvidia.open = true;
services.xserver.videoDrivers = [ "nvidia" ];
environment.systemPackages = [ python ];
systemd.services.cuda-test = {
description = "Cuda Test";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
Type = "exec";
User = "root";
Group = "root";
};
path = [ python ];
script = ''
cd /root
mkdir -p cudatest
cat << EOF > cudatest/main.py
from torch import cuda
print(cuda.is_available())
print(cuda.device_count())
print(cuda.get_device_name(cuda.current_device()))
EOF
cat << EOF > cudatest/run.sh
export CUDA_PATH=${pkgs.cudatoolkit};
export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib;${pkgs.linuxPackages.nvidiaPackages.stable}/lib";
python cudatest/main.py
EOF
chmod +x cudatest/run.sh
./cudatest/run.sh
'';
};
};
};
This results in the following output:
Aug 06 15:42:36 declarative-cudatest systemd[1]: Started Cuda Test.
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: /nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py:174: UserWarning: CUDA initializa>
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: return torch._C._cuda_getDeviceCount() > 0
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: False
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: 1
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: Traceback (most recent call last):
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: File "/root/cudatest/main.py", line 4, in <module>
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: print(cuda.get_device_name(cuda.current_device()))
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: ~~~~~~~~~~~~~~~~~~~^^
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: File "/nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py", line 1026, in current_d>
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: _lazy_init()
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: ~~~~~~~~~~^^
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: File "/nix/store/3krfkjlfvdcag4rx3va0zg0zf1k0qn15-python3-3.13.4-env/lib/python3.13/site-packages/torch/cuda/__init__.py", line 372, in _lazy_init
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: torch._C._cuda_init()
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: ~~~~~~~~~~~~~~~~~~~^^
Aug 06 15:42:40 declarative-cudatest cuda-test-start[267]: RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after progr>
Aug 06 15:42:40 declarative-cudatest systemd[1]: cuda-test.service: Main process exited, code=exited, status=1/FAILURE
(note that the CUDA device count is now reported as 1)
My use case is to run GPU-accelerated ollama in a nixos-container, similar to the user in the “Nixos-container permission problem” thread, although I’m using an Nvidia GPU. On the host, all of these applications run without issues.
Some more users with similar issues I found:
- A Reddit thread with the same symptom (but caused by systemd hardening on the host, not in a container)
- Another Reddit thread (possibly a different distro; there it was suggested to be a user-permission issue — I assume that is not my case, since manual execution as the same user (root inside the container) works)
Any suggestions on what can resolve this issue are greatly appreciated!