File: /snap/docker/3265/usr/share/nvidia-container-toolkit/lib
# NVIDIA Container Toolkit related setup functions #
U_MACHINE="$(uname -m)"
U_OS="$(uname -o)"
U_KERNEL="${U_OS##*/}"
U_USERLAND="${U_OS%%/*}"
ARCH_TRIPLET="${U_MACHINE}-${U_KERNEL,,}-${U_USERLAND,,}"
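# e.g. on an amd64 system "uname -m" gives "x86_64" and "uname -o" gives "GNU/Linux",
# so ARCH_TRIPLET expands to "x86_64-linux-gnu" #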
NVIDIA_SUPPORT_DISABLED="$(snapctl get nvidia-support.disabled)"
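# On classic systems the host driver is visible through snapd's hostfs;
# a libcuda.so symlink there marks the NVIDIA userspace driver as installed #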
if [ -L "/var/lib/snapd/hostfs/usr/lib/${ARCH_TRIPLET}/libcuda.so" ] ; then
NVIDIA_SUPPORT_CLASSIC="true"
else
NVIDIA_SUPPORT_CLASSIC="false"
fi
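# Wait for a character device to appear, polling up to TRIES times, SLEEP seconds apart #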
device_wait() {
COUNT=0
SLEEP=3
TRIES=10
echo "Waiting for device to become available: ${1}"
while [ "${COUNT}" -lt "${TRIES}" ] ; do
COUNT=$((COUNT + 1))
echo "Checking device: ${COUNT}/${TRIES}"
if [ -c "${1}" ] ; then
echo "Device found"
return 0
fi
sleep "${SLEEP}"
done
echo "Device not found"
return 1
}
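# Illustrative usage [ the device path is an example, not referenced by this file ]:
#   device_wait /dev/nvidiactl || exit 1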
# Check if hardware is present - just exit if not #
nvidia_hw_ensure() {
lspci -d 10de: | grep -q 'NVIDIA Corporation' || exit 0
echo "NVIDIA hardware detected: $(lspci -d 10de:)"
}
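# Illustrative usage: call first in a hook so machines without NVIDIA hardware exit cleanly:
#   nvidia_hw_ensure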
# Create any data dirs if missing #
ensure_nvidia_data_dirs() {
mkdir -p "${SNAP_DATA}/etc/cdi"
mkdir -p "${SNAP_DATA}/etc/nvidia-container-runtime"
}
# Generate the CDI config #
cdi_generate () {
# Allow a configured device-name-strategy, defaulting to index [ the nvidia-ctk default ] #
CDI_DEVICE_NAME_STRATEGY="$(snapctl get nvidia-support.cdi.device-name-strategy)"
CDI_DEVICE_NAME_STRATEGY="${CDI_DEVICE_NAME_STRATEGY:-index}"
# Default CDI library search path, config search path, and shell PATH for installs on core systems #
CDI_LIB_SEARCH_PATH="${SNAP}/graphics/lib/${ARCH_TRIPLET}"
CDI_CONFIG_SEARCH_PATH="${SNAP}/graphics/share"
CDI_PATH="${PATH}:${SNAP}/graphics/bin"
# Otherwise, on classic with the NVIDIA driver installed, point the CDI search paths and PATH at hostfs #
if [ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] ; then
CDI_LIB_SEARCH_PATH="/var/lib/snapd/hostfs/usr/lib/${ARCH_TRIPLET}"
CDI_CONFIG_SEARCH_PATH="/var/lib/snapd/hostfs/usr/share"
CDI_PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin"
fi
# Generate the CDI spec
XDG_DATA_DIRS="${XDG_DATA_DIRS:-}:${CDI_CONFIG_SEARCH_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:${CDI_LIB_SEARCH_PATH}" PATH="${CDI_PATH}" \
"${SNAP}/usr/bin/nvidia-ctk" cdi generate \
--nvidia-ctk-path "${SNAP}/usr/bin/nvidia-ctk" \
--library-search-path "${CDI_LIB_SEARCH_PATH}" \
--device-name-strategy "${CDI_DEVICE_NAME_STRATEGY}" \
--output "${SNAP_DATA}/etc/cdi/nvidia.yaml"
if [ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] ; then
# Replace container path for binaries such as nvidia-smi to make them discoverable from the default PATH
sed -i "s|containerPath: /var/lib/snapd/hostfs/usr/bin|containerPath: /usr/bin|g" "${SNAP_DATA}/etc/cdi/nvidia.yaml"
fi
}
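# The generated spec can be sanity-checked with the bundled CLI, e.g. [ illustrative;
# the cdi list subcommand and --spec-dir flag depend on the nvidia-ctk version ]:
#   "${SNAP}/usr/bin/nvidia-ctk" cdi list --spec-dir "${SNAP_DATA}/etc/cdi"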
# Create the nvidia runtime config, either snap default or custom #
nvidia_runtime_config () {
RUNTIME_CONFIG_OVERRIDE="$(snapctl get nvidia-support.runtime.config-override)"
# Custom #
if [ -n "${RUNTIME_CONFIG_OVERRIDE}" ] ; then
echo "${RUNTIME_CONFIG_OVERRIDE}" > "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
# Default - opinionated, but most viable option for now #
else
# FIXME: CDI spec-dirs can be set as a list using `"${SNAP_DATA}/etc/cdi",/var/run/cdi`, once this is fixed: https://github.com/NVIDIA/nvidia-container-toolkit/issues/466
rm -f "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
"${SNAP}/usr/bin/nvidia-ctk" config --in-place --set nvidia-container-runtime.mode=cdi --set nvidia-container-runtime.modes.cdi.spec-dirs="${SNAP_DATA}/etc/cdi" --config "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
fi
}
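# With the defaults above, config.toml is expected to end up roughly like
# [ illustrative shape; the exact contents are written by nvidia-ctk ]:
#   [nvidia-container-runtime]
#     mode = "cdi"
#     [nvidia-container-runtime.modes.cdi]
#       spec-dirs = ["<SNAP_DATA>/etc/cdi"]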
# Generate the dockerd runtime config #
docker_runtime_configure () {
"${SNAP}/usr/bin/nvidia-ctk" runtime configure --runtime=docker --runtime-path "${SNAP}/usr/bin/nvidia-container-runtime" --config "${SNAP_DATA}/config/daemon.json"
}
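# This adds an nvidia entry under runtimes in daemon.json, roughly [ illustrative ]:
#   "runtimes": { "nvidia": { "path": "<SNAP>/usr/bin/nvidia-container-runtime", "args": [] } }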
# Setup failure recovery #
setup_fail () {
echo "WARNING: Conainter Toolkit setup seemed to fail with an error"
# Remove nvidia runtime config, if it exists #
jq -r 'del(.runtimes.nvidia)' "${SNAP_DATA}/config/daemon.json" > "${SNAP_DATA}/config/daemon.json.new"
# If it was removed [ there was a change ], move the new config into place and remove the CDI and runtime configs #
if ! cmp "${SNAP_DATA}/config/daemon.json"{,.new} >/dev/null ; then
mv "${SNAP_DATA}/config/daemon.json"{.new,}
rm -f "${SNAP_DATA}/etc/cdi/nvidia.yaml"
rm -f "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
fi
}
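# Illustrative wiring from a setup hook [ hypothetical; the actual hook lives
# elsewhere in the snap ]:
#   nvidia_hw_ensure
#   ensure_nvidia_data_dirs
#   if cdi_generate && nvidia_runtime_config && docker_runtime_configure ; then
#       setup_info
#   else
#       setup_fail
#   fi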
# Info #
setup_info () {
echo "Conainter Toolkit setup complete"
}