
Commit cb32733 (initial commit, 0 parents)

Init commit

Signed-off-by: mYmNeo <[email protected]>

File tree: 27,408 files changed, with 8,423,714 additions and 0 deletions.

.gitignore

Lines changed: 1 addition & 0 deletions
```
go/
```

Makefile

Lines changed: 32 additions & 0 deletions
```makefile
.PHONY: all
all:
	hack/build.sh manager client

.PHONY: clean
clean:
	rm -rf ./go

.PHONY: vendor
vendor:
	rm -rf vendor
	hack/glide.sh

.PHONY: test
test:
	hack/build.sh "test"

.PHONY: proto
proto:
	hack/build.sh "proto"

.PHONY: img
img:
	hack/build.sh "img"

.PHONY: fmt
fmt:
	hack/build.sh "fmt"

.PHONY: lint
lint:
	@revive -config revive.toml -exclude vendor/... -exclude pkg/api/runtime/... ./...
```
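
The targets above all delegate to `hack/build.sh` (not shown in this excerpt). A typical local workflow might look like the following sketch, assuming Go and glide are available on the build machine:

```sh
make vendor   # fetch dependencies via hack/glide.sh
make all      # build the manager and client binaries into ./go/bin
make test     # run the test suite
make img      # build the container image
```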

README.md

Lines changed: 35 additions & 0 deletions
# GPU Manager

GPU Manager is used for managing the nvidia GPU devices in a Kubernetes cluster. It implements the Kubernetes `DevicePlugin` interface, so it is compatible with Kubernetes release 1.9 and later.

Compared with the combination of `nvidia-docker` and `nvidia-k8s-plugin`, GPU Manager uses native `runc` without modification, whereas the nvidia solution requires a modified runtime. Besides, it reports metrics without deploying any new components.

To schedule a GPU payload correctly, GPU Manager should work with `gpu-quota-admission`, which is a Kubernetes scheduler plugin.

GPU Manager also supports payloads that use a fraction of a GPU device, such as 0.1 of a card or 100 MiB of GPU device memory. If you want this kind of feature, please refer to the `vcuda` project.

# How to deploy GPU Manager

GPU Manager runs as a DaemonSet. Because of RBAC restrictions and hybrid clusters, you need to do the following steps to make this DaemonSet run correctly.

- Create a service account and cluster role binding:

```
kubectl create sa gpu-manager -n kube-system
kubectl create clusterrolebinding gpu-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-manager
```

- Label the node with `nvidia-device-enable=enable`:

```
kubectl label node <node> nvidia-device-enable=enable
```

- Change gpu-manager.yaml and submit it: change `--incluster-mode` from `false` to `true`, change the image field to `<your repository>/public/gpu-manager:latest`, and add a `serviceAccount` field referencing the `gpu-manager` service account created above (a sketch follows below).
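
A hypothetical sketch of those edits; `gpu-manager.yaml` itself is not part of this commit, so the exact flag and field locations may differ:

```sh
sed -i 's/--incluster-mode=false/--incluster-mode=true/' gpu-manager.yaml
sed -i 's|image: .*|image: <your repository>/public/gpu-manager:latest|' gpu-manager.yaml
# the pod spec should also carry: serviceAccount: gpu-manager
kubectl apply -f gpu-manager.yaml
```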

VERSION

Lines changed: 1 addition & 0 deletions
```
0.2.0
```

build/Dockerfile

Lines changed: 68 additions & 0 deletions
```dockerfile
ARG base_img
FROM nvidia/cuda:10.1-devel-centos7 as build

ARG version
ARG commit

RUN yum install -y rpm-build make git

ENV GOLANG_VERSION 1.12.4
RUN curl -sSL https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz \
    | tar -C /usr/local -xz
ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

RUN mkdir -p /root/rpmbuild/{SPECS,SOURCES}

COPY gpu-manager.spec /root/rpmbuild/SPECS
COPY gpu-manager-source.tar.gz /root/rpmbuild/SOURCES

RUN echo '%_topdir /root/rpmbuild' > /root/.rpmmacros \
    && echo '%__os_install_post %{nil}' >> /root/.rpmmacros \
    && echo '%debug_package %{nil}' >> /root/.rpmmacros
WORKDIR /root/rpmbuild/SPECS
RUN rpmbuild -bb --quiet \
    --define 'version '${version}'' \
    --define 'commit '${commit}'' \
    gpu-manager.spec

FROM $base_img

ARG version
ARG commit

COPY --from=build /root/rpmbuild/RPMS/x86_64/gpu-manager-${version}-${commit}.el7.x86_64.rpm /tmp

RUN yum install epel-release -y && \
    yum install -y which jq

# Install packages
RUN rpm -ivh /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm \
    && rm -rf /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm

# kubelet
VOLUME ["/var/lib/kubelet/device-plugins"]

# gpu manager storage
VOLUME ["/etc/gpu-manager/vm"]
VOLUME ["/etc/gpu-manager/vdriver"]
VOLUME ["/var/log/gpu-manager"]

# nvidia library search location
VOLUME ["/usr/local/host"]

RUN echo "/usr/local/nvidia/lib" > /etc/ld.so.conf.d/nvidia.conf && \
    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH=$PATH:/usr/local/nvidia/bin

# cgroup
VOLUME ["/sys/fs/cgroup"]

# display
EXPOSE 5678

COPY start.sh /
COPY copy-bin-lib.sh /

CMD ["/start.sh"]
```
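
A hypothetical invocation of this two-stage build; the `base_img` value and the staging of `gpu-manager.spec` and `gpu-manager-source.tar.gz` into the build context are assumptions here (the `make img` target presumably automates this):

```sh
docker build \
  --build-arg base_img=centos:7 \
  --build-arg version=$(cat VERSION) \
  --build-arg commit=$(git rev-parse --short HEAD) \
  -t <your repository>/public/gpu-manager:latest \
  build/
```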

build/copy-bin-lib.sh

Lines changed: 83 additions & 0 deletions
```bash
#!/bin/bash

set -o pipefail
set -o errexit
set -o nounset

FILE=${FILE:-"/etc/gpu-manager/volume.conf"}
LIB_FILES=$(jq -r .volume[1].components.libraries[] ${FILE})
BIN_FILES=$(jq -r .volume[1].components.binaries[] ${FILE})
readonly NV_DIR="/usr/local/nvidia"
readonly FIND_BASE=${FIND_BASE:-"/usr/local/host"}

# Print "64" if the library is a 64-bit ELF, empty otherwise; used to pick
# between ${NV_DIR}/lib and ${NV_DIR}/lib64.
function check_arch() {
  local -r lib=$1
  if [[ $(objdump -f ${lib} | grep -o "elf64-x86-64") == "elf64-x86-64" ]]; then
    echo "64"
  else
    echo ""
  fi
}

# Copy every real library matching the given name (anything carrying a
# SONAME), skipping compile-time stubs.
function copy_lib() {
  for target in $(find /usr -name "${1}*" | grep -v "stubs"); do
    if [[ $(objdump -p ${target} 2>/dev/null | grep -o "SONAME") == "SONAME" ]]; then
      copy_directory ${target} "${NV_DIR}/lib$(check_arch ${target})"
    fi
  done
}

function copy_bin() {
  for target in $(find /usr -name "${1}"); do
    copy_directory ${target} "${NV_DIR}/bin/"
  done
}

function copy_directory() {
  local -r lib=$1
  local -r path=$2

  echo "copy ${lib} to ${path}"
  cp -Pf "${lib}" "${path}"
}

rm -rf ${NV_DIR}
mkdir -p ${NV_DIR}/{bin,lib,lib64}

for file in ${LIB_FILES[@]}; do
  copy_lib ${file}
done

for file in ${BIN_FILES[@]}; do
  copy_bin ${file}
done

# fix libvdpau_nvidia.so
(
  cd ${NV_DIR}/lib
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

# fix libnvidia-ml.so
(
  cd ${NV_DIR}/lib
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)
```
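
`volume.conf` itself is not shown in this commit; the `jq` paths above imply a structure like the following. This is a minimal hypothetical sketch, and the library and binary names are illustrative only:

```sh
cat > /etc/gpu-manager/volume.conf <<'EOF'
{
  "volume": [
    { "components": { "libraries": [], "binaries": [] } },
    {
      "components": {
        "libraries": ["libcuda", "libnvidia-ml"],
        "binaries": ["nvidia-smi"]
      }
    }
  ]
}
EOF
```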

build/extra-config.json

Lines changed: 1 addition & 0 deletions
```json
{}
```

build/gpu-manager.conf

Lines changed: 1 addition & 0 deletions
```
GPU_MANAGER_ARGS="--extra-config=/etc/gpu-manager/extra-config.json --addr=/var/run/gpu-manager.sock --v=2 --logtostderr"
```

build/gpu-manager.service

Lines changed: 34 additions & 0 deletions
```ini
[Unit]
Description=GPU Manager Runtime
After=network-online.target docker.socket kubelet.service
Wants=network-online.target kubelet.service

[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues
# still exist and systemd currently does not support the cgroup feature set
# required for containers run by docker
EnvironmentFile=-/etc/gpu-manager/gpu-manager.conf
ExecStart=/usr/bin/gpu-manager $GPU_MANAGER_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this option.
#TasksMax=infinity
TimeoutStartSec=0
# set Delegate=yes so that systemd does not reset the cgroups of containers
Delegate=yes
# kill only the gpu-manager process, not all processes in the cgroup
KillMode=process
# restart the gpu-manager process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
UMask=0000

[Install]
WantedBy=multi-user.target
```
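
For a bare-metal install from the RPM (rather than the DaemonSet), the unit is managed the usual systemd way:

```sh
systemctl daemon-reload
systemctl enable --now gpu-manager.service
journalctl -u gpu-manager.service -f   # follow the logs
```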

build/gpu-manager.spec

Lines changed: 48 additions & 0 deletions
```
Name: gpu-manager
Version: %{version}
Release: %{commit}%{?dist}
Summary: GPU Manager Plugin for Kubernetes

License: MIT
Source: gpu-manager-source.tar.gz

Requires: systemd-units

%define pkgname %{name}-%{version}-%{release}

%description
GPU Manager Plugin for Kubernetes

%prep
%setup -n gpu-manager-%{version}

%build
make all

%install
install -d $RPM_BUILD_ROOT/%{_bindir}
install -d $RPM_BUILD_ROOT/%{_unitdir}
install -d $RPM_BUILD_ROOT/etc/gpu-manager

install -p -m 755 ./go/bin/gpu-manager $RPM_BUILD_ROOT/%{_bindir}/
install -p -m 755 ./go/bin/gpu-client $RPM_BUILD_ROOT/%{_bindir}/

install -p -m 644 ./build/extra-config.json $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/gpu-manager.conf $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/volume.conf $RPM_BUILD_ROOT/etc/gpu-manager/

install -p -m 644 ./build/gpu-manager.service $RPM_BUILD_ROOT/%{_unitdir}/

%clean
rm -rf $RPM_BUILD_ROOT

%files
%config(noreplace,missingok) /etc/gpu-manager/extra-config.json
%config(noreplace,missingok) /etc/gpu-manager/gpu-manager.conf
%config(noreplace,missingok) /etc/gpu-manager/volume.conf

/%{_bindir}/gpu-manager
/%{_bindir}/gpu-client

/%{_unitdir}/gpu-manager.service
```
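
A hypothetical local RPM build mirroring the `rpmbuild` invocation in `build/Dockerfile`; the version and commit values are illustrative, and `%setup` expects the source tarball to unpack into a `gpu-manager-<version>` directory:

```sh
rpmbuild -bb \
  --define 'version 0.2.0' \
  --define 'commit abc1234' \
  gpu-manager.spec
```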

build/start.sh

Lines changed: 13 additions & 0 deletions
```bash
#!/bin/bash

set -o errexit
set -o pipefail
set -o nounset

source copy-bin-lib.sh

echo "rebuild ldcache"
/usr/sbin/ldconfig

echo "launch gpu manager"
/usr/bin/gpu-manager --extra-config=/etc/gpu-manager/extra-config.json --v=${LOG_LEVEL} --hostname-override=${NODE_NAME} --kubeconfig=/root/.kube/config --share-mode=true --volume-config=/etc/gpu-manager/volume.conf --log-dir=/var/log/gpu-manager --query-addr=0.0.0.0 ${EXTRA_FLAGS:-""}
```
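
Because of `set -o nounset`, `LOG_LEVEL` and `NODE_NAME` must be set before the script runs; in the DaemonSet they would typically be injected through the pod spec (`NODE_NAME` via the downward API). A hypothetical manual invocation:

```sh
LOG_LEVEL=2 NODE_NAME=$(hostname) /start.sh
```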
