Skip to content

Commit cb71779

Browse files
hohAmozPay
authored and
AmozPay
committed
Feature: System resources were not exposed
The scheduling of persistent VMs requires external services to fetch the available system resources of the host. Solution: Add a new HTTP endpoint on `/about/usage/system` that exposes system resources and system properties of the host machine.
1 parent db81f78 commit cb71779

File tree

5 files changed

+124
-2
lines changed

5 files changed

+124
-2
lines changed

.github/workflows/test-on-droplet.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ jobs:
6565
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci --output json | ./.github/scripts/extract_droplet_ipv4.py)"
6666
6767
sleep 3
68+
curl --retry 5 "http://${DROPLET_IPV4}:4020/about/usage/system"
6869
curl --retry 5 "http://${DROPLET_IPV4}:4020/status/check/fastapi"
6970
7071
- name: Cleanup

docker/vm_supervisor-dev.dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ FROM debian:bullseye
55
RUN apt-get update && apt-get -y upgrade && apt-get install -y \
66
sudo acl curl squashfs-tools git \
77
python3 python3-aiohttp python3-msgpack python3-pip python3-aiodns python3-aioredis \
8-
python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging \
8+
python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo \
99
&& rm -rf /var/lib/apt/lists/*
1010

1111
RUN useradd jailman

packaging/aleph-vm/DEBIAN/control

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@ Version: 0.1.8
33
Architecture: all
44
Maintainer: Aleph.im
55
Description: Aleph.im VM execution engine
6-
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging
6+
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo
77
Section: aleph-im
88
Priority: Extra

vm_supervisor/resources.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
from datetime import datetime, timezone
2+
from functools import lru_cache
3+
from typing import Tuple
4+
5+
import cpuinfo
6+
import psutil
7+
from aiohttp import web
8+
from aleph_message.models.program import CpuProperties
9+
from pydantic import BaseModel
10+
11+
from .conf import settings
12+
13+
14+
class Period(BaseModel):
    """Model wrapping a single point in time."""

    # The moment this period refers to.
    datetime: datetime
16+
17+
18+
class LoadAverage(BaseModel):
    """System load averages over the last 1, 5 and 15 minutes."""

    load1: float
    load5: float
    load15: float

    @classmethod
    def from_psutil(cls, psutil_loadavg: Tuple[float, float, float]):
        """Build a LoadAverage from the 3-tuple returned by `psutil.getloadavg()`.

        The tuple is read positionally: index 0 is the 1-minute average,
        index 1 the 5-minute average and index 2 the 15-minute average.
        """
        one_minute = psutil_loadavg[0]
        five_minutes = psutil_loadavg[1]
        fifteen_minutes = psutil_loadavg[2]
        return cls(load1=one_minute, load5=five_minutes, load15=fifteen_minutes)
30+
31+
32+
class CoreFrequencies(BaseModel):
    """Lowest and highest CPU core frequencies reported by psutil."""

    min: float
    max: float

    @classmethod
    def from_psutil(cls, psutil_freq: psutil._common.scpufreq):
        """Build a CoreFrequencies from the result of `psutil.cpu_freq()`.

        psutil sets `min` / `max` to 0.0 when they cannot be determined; in
        that case the `current` frequency is used for the missing bound.

        `psutil.cpu_freq()` returns None when no frequency information is
        available at all (e.g. on some virtual machines). That case is
        reported as 0.0 for both bounds, following psutil's own "unknown"
        convention, instead of failing with an AttributeError.
        """
        if psutil_freq is None:
            return cls(min=0.0, max=0.0)

        # Local names deliberately avoid shadowing the `min` / `max` builtins.
        lowest = psutil_freq.min or psutil_freq.current
        highest = psutil_freq.max or psutil_freq.current
        return cls(min=lowest, max=highest)
41+
42+
43+
class CpuUsage(BaseModel):
    """CPU section of the usage report: core count, load and frequencies."""

    # Number of CPUs (filled from `psutil.cpu_count()` by `about_system_usage`).
    count: int
    # Load averages over 1, 5 and 15 minutes.
    load_average: LoadAverage
    # Minimum / maximum core frequencies.
    core_frequencies: CoreFrequencies
47+
48+
49+
class MemoryUsage(BaseModel):
    """Memory section of the usage report.

    `about_system_usage` fills both fields from `psutil.virtual_memory()`
    byte counts divided by 1000.
    """

    # Total memory.
    total_kB: int
    # Memory currently available.
    available_kB: int
52+
53+
54+
class DiskUsage(BaseModel):
    """Disk section of the usage report.

    `about_system_usage` fills both fields from `psutil.disk_usage()` byte
    counts for `settings.PERSISTENT_VOLUMES_DIR`, divided by 1000.
    """

    # Total size of the volume.
    total_kB: int
    # Free space on the volume.
    available_kB: int
57+
58+
59+
class UsagePeriod(BaseModel):
    """Time window that a usage report applies to."""

    # Beginning of the window.
    start_timestamp: datetime
    # Length of the window, in seconds.
    duration_seconds: float
62+
63+
64+
class MachineProperties(BaseModel):
    """Static properties of the host machine."""

    # CPU properties, using the `CpuProperties` model from `aleph_message`.
    cpu: CpuProperties
66+
67+
68+
class MachineUsage(BaseModel):
    """Complete usage report for the host, as returned by `about_system_usage`."""

    # CPU count, load averages and frequencies.
    cpu: CpuUsage
    # Total and available memory.
    mem: MemoryUsage
    # Total and available disk space.
    disk: DiskUsage
    # Time window the report applies to.
    period: UsagePeriod
    # Static properties of the machine.
    properties: MachineProperties
    # Defaults to True when not specified.
    active: bool = True
75+
76+
77+
@lru_cache
def get_machine_properties() -> MachineProperties:
    """Fetch machine properties such as architecture, CPU vendor, ...

    These should not change while the supervisor is running, so the result
    is cached and the (slow) CPU probing only happens on the first call.

    In the future, some properties may have to be fetched from within a VM.

    Raises:
        KeyError: if py-cpuinfo reports neither the old nor the new name of
            a required field.
    """
    cpu_info = cpuinfo.get_cpu_info()  # Slow

    # py-cpuinfo renamed these fields in newer releases
    # ("raw_arch_string" -> "arch_string_raw", "vendor_id" -> "vendor_id_raw").
    # Prefer the original name and fall back to the new one so that both
    # old and new versions of the library are supported.
    arch_key = "raw_arch_string" if "raw_arch_string" in cpu_info else "arch_string_raw"
    vendor_key = "vendor_id" if "vendor_id" in cpu_info else "vendor_id_raw"

    return MachineProperties(
        cpu=CpuProperties(
            architecture=cpu_info[arch_key],
            vendor=cpu_info[vendor_key],
        ),
    )
91+
92+
93+
async def about_system_usage(request: web.Request):
    """HTTP handler returning the host's resource usage and properties as JSON.

    The report contains CPU count, load averages and core frequencies, total
    and available memory, total and free disk space of
    `settings.PERSISTENT_VOLUMES_DIR`, the reporting period and the cached
    machine properties.

    Args:
        request: The incoming aiohttp request (not inspected).

    Returns:
        An aiohttp JSON response whose body is the serialized `MachineUsage`,
        with fields set to None omitted.
    """
    # The report covers the current minute: truncate "now" (UTC) to the minute.
    period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)

    # Read each source once so that "total" and "available" come from the
    # same snapshot.
    memory = psutil.virtual_memory()
    disk = psutil.disk_usage(settings.PERSISTENT_VOLUMES_DIR)

    usage: MachineUsage = MachineUsage(
        cpu=CpuUsage(
            count=psutil.cpu_count(),
            load_average=LoadAverage.from_psutil(psutil.getloadavg()),
            core_frequencies=CoreFrequencies.from_psutil(psutil.cpu_freq()),
        ),
        mem=MemoryUsage(
            # Integer division: the fields are ints, and passing floats
            # relies on lossy coercion by the model. This also matches how
            # the disk values are computed.
            total_kB=memory.total // 1000,
            available_kB=memory.available // 1000,
        ),
        disk=DiskUsage(
            total_kB=disk.total // 1000,
            available_kB=disk.free // 1000,
        ),
        period=UsagePeriod(
            start_timestamp=period_start,
            duration_seconds=60,
        ),
        properties=get_machine_properties(),
    )
    return web.json_response(
        text=usage.json(exclude_none=True),
    )

vm_supervisor/supervisor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from . import __version__
1515
from . import metrics
1616
from .conf import settings
17+
from .resources import about_system_usage
1718
from .run import pool
1819
from .tasks import start_watch_for_messages_task, stop_watch_for_messages_task
1920
from .views import (
@@ -49,6 +50,7 @@ async def server_version_middleware(
4950
web.get("/about/login", about_login),
5051
web.get("/about/executions", about_executions),
5152
web.get("/about/executions/records", about_execution_records),
53+
web.get("/about/usage/system", about_system_usage),
5254
web.get("/about/config", about_config),
5355
web.get("/status/check/fastapi", status_check_fastapi),
5456
web.get("/status/check/version", status_check_version),

0 commit comments

Comments
 (0)