Skip to content

Commit dfd0ecc

Browse files
committed
Emulate the vsyscall page in userspace in the x86_64 Docker image
Since some recent distros are shipping with vsyscall=none by default, the manylinux1 Docker image doesn't work. Fortunately, we can emulate everything in userspace by catching segmentation faults for the vsyscall addresses and forcing the program to use the vDSO instead. Add an entrypoint to the x86_64 Docker image to detect whether this emulation is required, and if so, catch these segfaults via ptrace and adjust the instruction pointer. Using the ptrace syscall at all in recent versions of Docker requires docker run --security-opt=seccomp:unconfined (which an error message will tell you to do if needed). There is also a mode for the ptrace helper to trace an existing process and its children. Because `docker build` doesn't support the `--security-opt` option, this can be useful for building the manylinux1 image, by running this helper on docker-containerd.
1 parent f7356e3 commit dfd0ecc

File tree

5 files changed

+261
-0
lines changed

5 files changed

+261
-0
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ matrix:
2323
- env: PLATFORM="x86_64"
2424

2525
script:
26+
- make -C docker/vsyscall_emu
2627
- docker build --rm -t quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT -f docker/Dockerfile-$PLATFORM docker/
2728

2829

docker/Dockerfile-x86_64

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
88
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib
99
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
1010

11+
COPY vsyscall_emu/vsyscall_trace /usr/local/sbin/vsyscall_trace
12+
1113
COPY build_scripts /build_scripts
1214
RUN bash build_scripts/build.sh && rm -r build_scripts
1315

1416
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
1517

18+
ENTRYPOINT ["/usr/local/sbin/vsyscall_trace"]
19+
1620
CMD ["/bin/bash"]

docker/vsyscall_emu/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
vsyscall_trace

docker/vsyscall_emu/Makefile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Build the ptrace-based vsyscall emulation helper.
# The helper is only built when PLATFORM is x86_64; for any other value
# of PLATFORM the default target does nothing.
ifeq ($(PLATFORM),x86_64)
all: vsyscall_trace
else
all:
endif

# -ldl is needed because the helper uses dlopen()/dlsym().
vsyscall_trace: vsyscall_trace.c
	$(CC) -o $@ $< -ldl

# $(RM) already expands to "rm -f", so no extra -f is needed.
clean:
	$(RM) vsyscall_trace

# Neither target names a real file.
.PHONY: all clean

docker/vsyscall_emu/vsyscall_trace.c

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
/* Using ptrace, catch when a process in a process tree is about to
2+
* segfault from an attempted vsyscall, and fix it up to use the vDSO
3+
* instead.
4+
*
5+
* usage: vsyscall_trace -p <pid>...
6+
* vsyscall_trace <cmd> [args...]
7+
*
8+
* In the first mode, traces a process and all its children, until they
9+
* exit. In the second mode, run and trace a child process -- unless
10+
* vsyscalls are enabled, in which case it will just exec the child
11+
* process directly. Because the second mode waits on child processes (as
12+
* required by the ptrace API), it is usable as init inside a container.
13+
* Whether or not it runs as init, it will block until all descendant
14+
* processes exit.
15+
*
16+
* This program itself uses no vsyscalls, so it can be safely
17+
* dynamically linked against an older glibc.
18+
*/
19+
20+
#define _GNU_SOURCE
21+
#include <sys/auxv.h>
22+
#include <sys/ptrace.h>
23+
#include <sys/types.h>
24+
#include <sys/stat.h>
25+
#include <sys/wait.h>
26+
#include <sys/user.h>
27+
#include <dlfcn.h>
28+
#include <errno.h>
29+
#include <fcntl.h>
30+
#include <signal.h>
31+
#include <stdlib.h>
32+
#include <stdio.h>
33+
#include <string.h>
34+
#include <unistd.h>
35+
36+
/* Debug tracing. When DEBUG is defined, debug_printf is plain printf.
 * Otherwise each call expands to a void no-op expression, so the
 * arguments are never evaluated and the call site does not trigger a
 * "statement with no effect" warning. */
#ifdef DEBUG
#define debug_printf printf
#else
#define debug_printf(...) ((void)0)
#endif
41+
42+
/* Fixed addresses of the three legacy vsyscall entry points. These are
 * ABI constants: see arch/x86/include/uapi/asm/vsyscall.h in the kernel
 * source (probably installed on your system as <asm/vsyscall.h>). The
 * first one sits at VSYSCALL_ADDR and each following call is 1024 bytes
 * further on. */
const unsigned long VSYS_gettimeofday = 0xffffffffff600000UL;
const unsigned long VSYS_time = 0xffffffffff600400UL;
const unsigned long VSYS_getcpu = 0xffffffffff600800UL;

/* The vDSO is an area of memory that looks like a normal relocatable
 * dynamic library, magically placed in your address space by the
 * kernel. With ASLR enabled it is mapped at a different address in each
 * process, but because the kernel only contains one vDSO the relative
 * offsets of its functions are the same everywhere. The variables below
 * hold those relative offsets as found in the current process. */
unsigned long VDSO_gettimeofday;
unsigned long VDSO_time;
unsigned long VDSO_getcpu;
57+
58+
/* Look up the vDSO base address for a process in its auxiliary vector.
 * See proc(5) and getauxval(3). If we can ptrace the process, we should
 * have permissions to do this.
 *
 * pid: process whose /proc/<pid>/auxv is read.
 *
 * Returns the value of the AT_SYSINFO_EHDR entry, or 0 if the file
 * cannot be opened or read, or if it holds no such entry. */
unsigned long vdso_address(pid_t pid) {
	/* Fixed buffer: large enough for "/proc/" + any decimal pid_t +
	 * "/auxv", so no heap allocation (and no leak) is involved. */
	char filename[64];
	unsigned long buf[128];
	size_t filled = 0;
	size_t entries;
	int len;
	int fd;

	len = snprintf(filename, sizeof(filename), "/proc/%d/auxv", (int)pid);
	if (len < 0 || (size_t)len >= sizeof(filename)) {
		return 0;
	}

	fd = open(filename, O_RDONLY);
	if (fd == -1) {
		return 0;
	}

	/* read() may return fewer bytes than requested, so keep going
	 * until end of file or until the buffer is full. */
	while (filled < sizeof(buf)) {
		ssize_t n = read(fd, (char *)buf + filled, sizeof(buf) - filled);
		if (n == -1) {
			if (errno == EINTR) {
				continue;
			}
			close(fd);
			return 0;
		}
		if (n == 0) {
			break;
		}
		filled += (size_t)n;
	}
	close(fd);

	/* The vector is a list of (type, value) pairs of unsigned longs.
	 * Only look at pairs that were actually read from the file. */
	entries = filled / sizeof(buf[0]);
	for (size_t i = 0; i + 1 < entries; i += 2) {
		if (buf[i] == AT_SYSINFO_EHDR) {
			return buf[i + 1];
		}
		if (buf[i] == AT_NULL) {
			/* End-of-vector marker: no vDSO entry present. */
			break;
		}
	}
	return 0;
}
85+
86+
/* If the ptraced process segfaulted because it tried to call one of the
87+
* three vsyscalls, redirect its instruction pointer to the
88+
* corresponding vDSO address. The calling conventions are the same, so
89+
* we don't need to change / inspect arguments or do any other safety
90+
* checks - the process could have gotten here on its own. */
91+
int handle_vsyscall(pid_t pid) {
92+
struct user_regs_struct regs;
93+
ptrace(PTRACE_GETREGS, pid, 0, &regs);
94+
if ((regs.rip & 0xfffffffffffff0ff) == 0xffffffffff600000) {
95+
debug_printf("handling vsyscall for %d\n", pid);
96+
unsigned long vdso = vdso_address(pid);
97+
if (vdso_address == 0) {
98+
debug_printf("couldn't find vdso\n");
99+
return 0;
100+
}
101+
102+
if (regs.rip == VSYS_gettimeofday) {
103+
regs.rip = vdso | VDSO_gettimeofday;
104+
} else if (regs.rip == VSYS_time) {
105+
regs.rip = vdso | VDSO_time;
106+
} else if (regs.rip == VSYS_getcpu) {
107+
regs.rip = vdso | VDSO_getcpu;
108+
} else {
109+
debug_printf("invalid vsyscall %x\n", regs.rip);
110+
return 0;
111+
}
112+
ptrace(PTRACE_SETREGS, pid, 0, &regs);
113+
return 1;
114+
}
115+
return 0;
116+
}
117+
118+
int main(int argc, char *argv[]) {
119+
pid_t pid, child_pid = 0;
120+
int wstatus, child_wstatus = 0;
121+
122+
if (argc < 2) {
123+
printf("usage: vsyscall_trace -p <pid>...\n");
124+
printf(" vsyscall_trace <cmd> [args...]\n");
125+
return 1;
126+
}
127+
128+
/* Seize all the processes via ptrace. We don't need to track
129+
* them, we only need to call wait(), and the options we're
130+
* passing to PTRACE_SEIZE will cause us to silently pick up
131+
* child processes too. */
132+
if (strcmp(argv[1], "-p") == 0) {
133+
int i;
134+
for (i = 2; i < argc; i++) {
135+
pid = atoi(argv[i]);
136+
if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) {
137+
perror("PTRACE_SEIZE");
138+
return 1;
139+
}
140+
}
141+
} else {
142+
/* Test to see if vsyscalls work on this machine. If so,
143+
* we don't need to do anything - exec the given command
144+
* so we get entirely out of the way and don't risk
145+
* breaking the process. */
146+
child_pid = fork();
147+
if (child_pid == -1) {
148+
perror("fork");
149+
return 1;
150+
} else if (child_pid == 0) {
151+
((time_t (*)(time_t *))VSYS_time)(NULL);
152+
return 0;
153+
} else {
154+
waitpid(child_pid, &wstatus, 0);
155+
/* If the child process segfaulted, it will show
156+
* up as WIFSIGNALED instead of WIFEXITED. */
157+
if (WIFEXITED(wstatus)) {
158+
execvp(argv[1], &argv[1]);
159+
perror("execvp");
160+
return 1;
161+
}
162+
}
163+
164+
/* Actually start the child process. */
165+
child_pid = fork();
166+
if (child_pid == -1) {
167+
perror("fork");
168+
return 1;
169+
} else if (child_pid == 0) {
170+
/* Allow the parent process to run PTRACE_SEIZE
171+
* before continuing. */
172+
raise(SIGSTOP);
173+
execvp(argv[1], &argv[1]);
174+
perror("execvp");
175+
return 1;
176+
} else {
177+
if (ptrace(PTRACE_SEIZE, child_pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) {
178+
if (errno == EPERM) {
179+
fprintf(stderr, "Error: no kernel vsyscall support and ptrace is disabled.\n");
180+
fprintf(stderr, "Your kernel does not provide vsyscall emulation, and we cannot\n");
181+
fprintf(stderr, "work around this because ptrace is prohibited inside this container.\n");
182+
fprintf(stderr, "Either permit ptrace for this container (e.g., for Docker, use\n");
183+
fprintf(stderr, "docker run --security-opt=seccomp:unconfined) or boot your kernel\n");
184+
fprintf(stderr, "with vsyscall=emulate.\n");
185+
} else {
186+
perror("PTRACE_SEIZE");
187+
}
188+
kill(child_pid, SIGKILL);
189+
return 1;
190+
}
191+
192+
fprintf(stderr, "Warning: using ptrace-based vsyscall emulation.\n");
193+
fprintf(stderr, "This container contains old binaries which require the use of the legacy\n");
194+
fprintf(stderr, "'vsyscall' feature of the Linux kernel, and your kernel does not provide\n");
195+
fprintf(stderr, "vsyscall emulation. We will attempt to emulate vsyscalls ourselves using\n");
196+
fprintf(stderr, "ptrace, but performance may suffer and other tools that use ptrace (e.g.,\n");
197+
fprintf(stderr, "gdb and strace) will not work.\n");
198+
fprintf(stderr, "To avoid this emulation, please boot your kernel with vsyscall=emulate.\n");
199+
kill(child_pid, SIGCONT);
200+
}
201+
}
202+
203+
/* The vDSO shows up as an object in our address space naemd
204+
* "linux-vdso.so.1" that's already been loaded. */
205+
void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_NOLOAD);
206+
VDSO_gettimeofday = (unsigned long)dlsym(vdso, "__vdso_gettimeofday") & 0xfff;
207+
VDSO_time = (unsigned long)dlsym(vdso, "__vdso_time") & 0xfff;
208+
VDSO_getcpu = (unsigned long)dlsym(vdso, "__vdso_getcpu") & 0xfff;
209+
210+
while ((pid = waitpid(-1, &wstatus, 0)) != -1) {
211+
if (WIFSTOPPED(wstatus)) {
212+
if (WSTOPSIG(wstatus) == SIGSEGV && handle_vsyscall(pid)) {
213+
/* The last argument to PTRACE_CONT is
214+
* the signal to send - passing 0 means
215+
* to suppress the signal. */
216+
ptrace(PTRACE_CONT, pid, 0, 0);
217+
} else {
218+
ptrace(PTRACE_CONT, pid, 0, WSTOPSIG(wstatus));
219+
}
220+
} else if (pid == child_pid && WIFEXITED(wstatus)) {
221+
/* Save this exit status so we can use it as our
222+
* own exit status. But don't exit yet if there
223+
* are further descendant processes still
224+
* running. */
225+
child_wstatus = wstatus;
226+
}
227+
}
228+
if (errno != ECHILD) {
229+
perror("waitpid");
230+
return 1;
231+
}
232+
if (WIFSIGNALED(wstatus)) {
233+
/* Send ourselves the same signal that killed the child
234+
* process, so our own parent process reports the right
235+
* exit status. */
236+
raise(WTERMSIG(wstatus));
237+
/* In case that signal is not fatal, return nonzero. */
238+
return 1;
239+
} else {
240+
return WEXITSTATUS(wstatus);
241+
}
242+
}

0 commit comments

Comments
 (0)