Skip to content

kola: Add soft-reboot support for external tests #4119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions docs/kola/external-tests.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,37 @@ it out.
(Previously the API for this was to send `SIGTERM` to the current process; that
method is deprecated and will be removed at some point)

## Support for soft-rebooting

Kola also supports soft-rebooting using systemd's `systemctl soft-reboot` command.
Soft-reboot restarts the userspace while keeping the kernel and hardware state intact.
This is useful for testing userspace updates without a full system reboot.

The soft-reboot API is similar to the regular reboot API:

```
#!/bin/bash
# Example of soft-reboot test
set -xeuo pipefail
case "${AUTOPKGTEST_REBOOT_MARK:-}" in
"") echo "test beginning"; /tmp/autopkgtest-soft-reboot mark1 ;;
mark1) echo "test in mark1"; /tmp/autopkgtest-soft-reboot mark2 ;;
mark2) echo "test in mark2" ;;
*) echo "unexpected mark: ${AUTOPKGTEST_REBOOT_MARK}"; exit 1;;
esac
echo "ok autopkgtest soft-rebooting"
```

Key differences between a soft-reboot and a regular reboot:
- The kernel boot ID (`/proc/sys/kernel/random/boot_id`) remains the same
- Hardware state and kernel memory are preserved
- `/run` is not cleared; files created there survive the soft-reboot
- Only userspace is restarted
- Uses `systemctl soft-reboot` instead of `reboot`

Both `/tmp/autopkgtest-soft-reboot` and `/tmp/autopkgtest-soft-reboot-prepare` scripts are available,
analogous to their regular reboot counterparts.

## HTTP Server

The `kolet` binary is copied into the `/usr/local/bin/` directory on the CoreOS
Expand Down
12 changes: 12 additions & 0 deletions mantle/cmd/kola/devshell.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ func runDevShellSSH(ctx context.Context, builder *platform.QemuBuilder, conf *co
_ = inst.Kill()
case guestStateInReboot:
statusMsg = "QEMU guest initiated reboot"
case guestStateInSoftReboot:
statusMsg = "QEMU guest initiated soft-reboot"
case guestStateOpenSshStopped:
statusMsg = "QEMU openssh is not listening"
case guestStateSshDisconnected:
Expand Down Expand Up @@ -285,6 +287,8 @@ const (
guestStateInShutdown
// guestStateInReboot indicates that the guest has started a reboot
guestStateInReboot
// guestStateInSoftReboot indicates that the guest has started a soft-reboot
guestStateInSoftReboot
// guestStateHalted indicates that the guest has halted or shutdown
guestStateHalted
// guestStateBooting indicates that the instance is in early boot
Expand Down Expand Up @@ -325,6 +329,9 @@ func checkWriteState(msg string, c chan<- guestState) {
if strings.Contains(msg, "Starting Reboot...") {
c <- guestStateInReboot
}
if strings.Contains(msg, "Reached target soft-reboot") {
c <- guestStateInSoftReboot
}
}

type systemdEventMessage struct {
Expand Down Expand Up @@ -428,6 +435,11 @@ func watchJournal(builder *platform.QemuBuilder, conf *conf.Conf, stateChan chan
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
guestState: guestStateInShutdown,
},
{
unit: "systemd-soft-reboot.service",
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
guestState: guestStateInSoftReboot,
},
}

r, err := builder.VirtioJournal(conf, "-o json --system")
Expand Down
103 changes: 102 additions & 1 deletion mantle/cmd/kolet/kolet.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,25 @@ reboot
autopkgtestRebootPrepareScript = `#!/bin/bash
set -euo pipefail
exec /usr/local/bin/kolet reboot-request "$1"
`

// Soft-reboot support
autopkgTestSoftRebootPath = "/tmp/autopkgtest-soft-reboot"
autopkgtestSoftRebootScript = `#!/bin/bash
set -xeuo pipefail
/usr/local/bin/kolet soft-reboot-request "$1"
systemctl soft-reboot
`
autopkgTestSoftRebootPreparePath = "/tmp/autopkgtest-soft-reboot-prepare"

autopkgtestSoftRebootPrepareScript = `#!/bin/bash
set -euo pipefail
exec /usr/local/bin/kolet soft-reboot-request "$1"
`

// File used to communicate between the script and the kolet runner internally
rebootRequestFifo = "/run/kolet-reboot"
rebootRequestFifo = "/run/kolet-reboot"
softRebootRequestFifo = "/run/kolet-soft-reboot"
)

var (
Expand Down Expand Up @@ -140,6 +155,13 @@ var (
SilenceUsage: true,
}

// cmdSoftReboot is the "soft-reboot-request MARK" subcommand; its
// handler is runSoftReboot. Usage output is silenced on error.
cmdSoftReboot = &cobra.Command{
Use: "soft-reboot-request MARK",
Short: "Request a soft reboot",
RunE: runSoftReboot,
SilenceUsage: true,
}

cmdHttpd = &cobra.Command{
Use: "httpd",
Short: "Start an HTTP server to serve the contents of the file system",
Expand Down Expand Up @@ -260,6 +282,11 @@ func initiateReboot(mark string) error {
}

func mkfifo(path string) error {
// Create a FIFO in an idempotent fashion
// as /run survives soft-reboots.
if _, err := os.Stat(path); err == nil {
return nil
}
c := exec.Command("mkfifo", path)
c.Stderr = os.Stderr
err := c.Run()
Expand All @@ -269,6 +296,20 @@ func mkfifo(path string) error {
return nil
}

// initiateSoftReboot reports a soft-reboot request for the given mark.
//
// It serializes a kola.KoletResult with only SoftReboot set and prints the
// JSON on stdout, logging to the journal before and after. It returns an
// error only if the JSON serialization fails.
func initiateSoftReboot(mark string) error {
	systemdjournal.Print(systemdjournal.PriInfo, "Processing soft-reboot request")

	payload, err := json.Marshal(&kola.KoletResult{SoftReboot: mark})
	if err != nil {
		return errors.Wrapf(err, "serializing KoletResult")
	}

	// The JSON document on stdout is the actual result; the journal entry
	// below is informational only.
	fmt.Println(string(payload))
	systemdjournal.Print(systemdjournal.PriInfo, "Acknowledged soft-reboot request with mark: %s", payload)
	return nil
}

func runExtUnit(cmd *cobra.Command, args []string) error {
rebootOff, _ := cmd.Flags().GetBool("deny-reboots")
// Write the autopkgtest wrappers
Expand All @@ -278,10 +319,18 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
if err := os.WriteFile(autopkgTestRebootPreparePath, []byte(autopkgtestRebootPrepareScript), 0755); err != nil {
return err
}
// Write the soft-reboot autopkgtest wrappers
if err := os.WriteFile(autopkgTestSoftRebootPath, []byte(autopkgtestSoftRebootScript), 0755); err != nil {
return err
}
if err := os.WriteFile(autopkgTestSoftRebootPreparePath, []byte(autopkgtestSoftRebootPrepareScript), 0755); err != nil {
return err
}

// Create the reboot cmdline -> login FIFO for the reboot mark and
// proxy it into a channel
rebootChan := make(chan string)
softRebootChan := make(chan string)
errChan := make(chan error)

// We want to prevent certain tests (like non-exclusive tests) from rebooting
Expand All @@ -303,6 +352,25 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
}
rebootChan <- string(buf)
}()

// Create soft-reboot FIFO and channel
err = mkfifo(softRebootRequestFifo)
if err != nil {
return err
}
go func() {
softRebootReader, err := os.Open(softRebootRequestFifo)
if err != nil {
errChan <- err
return
}
defer softRebootReader.Close()
buf, err := io.ReadAll(softRebootReader)
if err != nil {
errChan <- err
}
softRebootChan <- string(buf)
}()
}

ctx := context.Background()
Expand Down Expand Up @@ -344,6 +412,8 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
return err
case reboot := <-rebootChan:
return initiateReboot(reboot)
case softReboot := <-softRebootChan:
return initiateSoftReboot(softReboot)
case m := <-unitevents:
for n := range m {
if n == unitname {
Expand Down Expand Up @@ -397,6 +467,35 @@ func runReboot(cmd *cobra.Command, args []string) error {
return nil
}

// runSoftReboot implements the "soft-reboot-request MARK" subcommand.
//
// It mirrors runReboot but uses the soft-reboot request FIFO:
//  1. Fail if softRebootRequestFifo does not exist.
//  2. Ensure the acknowledgement FIFO (kola.KoletRebootAckFifo) exists.
//  3. Write the mark (args[0]) into softRebootRequestFifo.
//  4. Block until one byte can be read from the acknowledgement FIFO.
//
// Any error from creating, writing, opening or reading the FIFOs is returned.
func runSoftReboot(cmd *cobra.Command, args []string) error {
	if _, err := os.Stat(softRebootRequestFifo); os.IsNotExist(err) {
		return errors.New("Soft-reboots are not supported for this test, softRebootRequestFifo does not exist.")
	}

	mark := args[0]
	systemdjournal.Print(systemdjournal.PriInfo, "Requesting soft-reboot with mark: %s", mark)

	// The ack FIFO must exist before the request is sent, so that the
	// acknowledgement has somewhere to be written.
	if err := mkfifo(kola.KoletRebootAckFifo); err != nil {
		return err
	}
	if err := os.WriteFile(softRebootRequestFifo, []byte(mark), 0644); err != nil {
		return err
	}

	f, err := os.Open(kola.KoletRebootAckFifo)
	if err != nil {
		return err
	}
	// Release the descriptor on every return path; it was previously leaked.
	defer f.Close()

	// A single byte is enough: its arrival is the acknowledgement.
	buf := make([]byte, 1)
	if _, err := f.Read(buf); err != nil {
		return err
	}
	systemdjournal.Print(systemdjournal.PriInfo, "Soft-reboot request acknowledged")
	return nil
}

func runHttpd(cmd *cobra.Command, args []string) error {
port, _ := cmd.Flags().GetString("port")
path, _ := cmd.Flags().GetString("path")
Expand All @@ -413,6 +512,8 @@ func main() {
root.AddCommand(cmdRunExtUnit)
cmdReboot.Args = cobra.ExactArgs(1)
root.AddCommand(cmdReboot)
cmdSoftReboot.Args = cobra.ExactArgs(1)
root.AddCommand(cmdSoftReboot)
cmdHttpd.Flags().StringP("port", "", "80", "port")
cmdHttpd.Flags().StringP("path", "", "./", "path to filesystem contents to serve")
cmdHttpd.Args = cobra.ExactArgs(0)
Expand Down
59 changes: 42 additions & 17 deletions mantle/kola/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ const (

// KoletResult is serialized JSON passed from kolet to the harness
type KoletResult struct {
Reboot string
Reboot string
SoftReboot string
}

const KoletExtTestUnit = "kola-runext"
Expand Down Expand Up @@ -1105,6 +1106,10 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
if err != nil {
return errors.Wrapf(err, "getting boot id")
}
softrebootCount, err := platform.GetMachineSoftRebootCount(mach)
if err != nil {
return errors.Wrapf(err, "getting soft reboot count")
}
plog.Debug("Starting kolet run-test-unit")
if previousRebootState != "" {
// quote around the value for systemd
Expand Down Expand Up @@ -1137,27 +1142,47 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
return errors.Wrapf(err, "parsing kolet json %s", string(stdout))
}
}
// If no reboot is requested, we're done
if koletRes.Reboot == "" {
// If no reboot or soft-reboot is requested, we're done
if koletRes.Reboot == "" && koletRes.SoftReboot == "" {
return nil
}

// A reboot is requested
previousRebootState = koletRes.Reboot
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
// This signals to the subject that we have saved the mark, and the subject
// can proceed with rebooting. We stop sshd to ensure that the wait below
// doesn't log in while ssh is shutting down.
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
if err != nil {
return errors.Wrapf(err, "failed to acknowledge reboot")
// Handle regular reboot
if koletRes.Reboot != "" {
previousRebootState = koletRes.Reboot
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
// This signals to the subject that we have saved the mark, and the subject
// can proceed with rebooting. We stop sshd to ensure that the wait below
// doesn't log in while ssh is shutting down.
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
if err != nil {
return errors.Wrapf(err, "failed to acknowledge reboot")
}
plog.Debug("Waiting for reboot")
err = mach.WaitForReboot(120*time.Second, bootID)
if err != nil {
return errors.Wrapf(err, "Waiting for reboot")
}
plog.Debug("Reboot complete")
}
plog.Debug("Waiting for reboot")
err = mach.WaitForReboot(120*time.Second, bootID)
if err != nil {
return errors.Wrapf(err, "Waiting for reboot")

// Handle soft-reboot
if koletRes.SoftReboot != "" {
previousRebootState = koletRes.SoftReboot
plog.Debugf("Soft-reboot request with mark='%s'", previousRebootState)
// Use the soft reboot count we collected at the beginning of this loop iteration
// Acknowledge the soft-reboot request
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'echo > %s'", KoletRebootAckFifo))
if err != nil {
return errors.Wrapf(err, "failed to acknowledge soft-reboot")
}
plog.Debug("Waiting for soft-reboot")
err = mach.WaitForSoftReboot(120*time.Second, softrebootCount)
if err != nil {
return errors.Wrapf(err, "Waiting for soft-reboot")
}
plog.Debug("Soft-reboot complete")
}
plog.Debug("Reboot complete")
}
}

Expand Down
4 changes: 4 additions & 0 deletions mantle/platform/machine/aws/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
return platform.WaitForMachineReboot(am, am.journal, timeout, oldBootId)
}

// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldSoftRebootsCount, and returns
// that call's error unchanged.
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
return platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldSoftRebootsCount)
}

func (am *machine) Destroy() {
origConsole, err := am.cluster.flight.api.GetConsoleOutput(am.ID())
if err != nil {
Expand Down
9 changes: 9 additions & 0 deletions mantle/platform/machine/azure/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
return am.refetchIPs()
}

// WaitForSoftReboot waits for the soft-reboot via
// platform.WaitForMachineSoftReboot and, on success, refreshes the machine's
// IP addresses. A wait failure is returned as-is, without refetching.
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
	if waitErr := platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldSoftRebootsCount); waitErr != nil {
		return waitErr
	}
	// Addresses are not expected to change across a soft-reboot; the refetch
	// is purely defensive.
	return am.refetchIPs()
}

func (am *machine) Destroy() {
if err := am.saveConsole(); err != nil {
// log error, but do not fail to terminate instance
Expand Down
4 changes: 4 additions & 0 deletions mantle/platform/machine/do/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ func (dm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
return platform.WaitForMachineReboot(dm, dm.journal, timeout, oldBootId)
}

// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldSoftRebootsCount, and returns
// that call's error unchanged.
func (dm *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
return platform.WaitForMachineSoftReboot(dm, dm.journal, timeout, oldSoftRebootsCount)
}

func (dm *machine) Destroy() {
if err := dm.cluster.flight.api.DeleteDroplet(context.TODO(), dm.droplet.ID); err != nil {
plog.Errorf("Error deleting droplet %v: %v", dm.droplet.ID, err)
Expand Down
4 changes: 4 additions & 0 deletions mantle/platform/machine/esx/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ func (em *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
return platform.WaitForMachineReboot(em, em.journal, timeout, oldBootId)
}

// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldSoftRebootsCount, and returns
// that call's error unchanged.
func (em *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
return platform.WaitForMachineSoftReboot(em, em.journal, timeout, oldSoftRebootsCount)
}

func (em *machine) Destroy() {
if err := em.cluster.flight.api.TerminateDevice(em.ID()); err != nil {
plog.Errorf("Error terminating device %v: %v", em.ID(), err)
Expand Down
4 changes: 4 additions & 0 deletions mantle/platform/machine/gcloud/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ func (gm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
return platform.WaitForMachineReboot(gm, gm.journal, timeout, oldBootId)
}

// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and oldSoftRebootsCount, and returns
// that call's error unchanged.
func (gm *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
return platform.WaitForMachineSoftReboot(gm, gm.journal, timeout, oldSoftRebootsCount)
}

func (gm *machine) Destroy() {
if err := gm.saveConsole(); err != nil {
plog.Errorf("Error saving console for instance %v: %v", gm.ID(), err)
Expand Down
Loading
Loading