Skip to content

Commit db9908f

Browse files
authored
Time synchronization inside LCOW UVM (microsoft#1119)
Start time synchronization service in opengcs Changes to the opengcs to start the chronyd service after UVM boots. Signed-off-by: Amit Barve <[email protected]> * Signed-off-by: Amit Barve <[email protected]> * TimeSync service inside LCOW UVM. Add test to verify both chronyd running & disabled cases. Minor fixes in chronyd startup code. Signed-off-by: Amit Barve <[email protected]> * Run Chronyd with restart monitor Signed-off-by: Amit Barve <[email protected]> * Force chronyd to step update time if difference is big Signed-off-by: Amit Barve <[email protected]> * Fixes after rebase Signed-off-by: Amit Barve <[email protected]> * go mod vendor & tidy Signed-off-by: Amit Barve <[email protected]> * Use backoff package instead of manually calculating backoffs Signed-off-by: Amit Barve <[email protected]> * Rename gcs cmdline params, use io.ReadFull instead of io.Read Minor other fixes. Signed-off-by: Amit Barve <[email protected]> * go mod vendor Signed-off-by: Amit Barve <[email protected]> * Ignore err if file doesn't exist Signed-off-by: Amit Barve <[email protected]> * Use ioutil.ReadFile to read clock_name file Signed-off-by: Amit Barve <[email protected]> * minor fix Signed-off-by: Amit Barve <[email protected]> * Remove incorrect usage of backoff.MaxElapsedTime Signed-off-by: Amit Barve <[email protected]>
1 parent ddab09b commit db9908f

8 files changed

Lines changed: 217 additions & 1 deletion

File tree

cmd/gcs/main.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import (
88
"io"
99
"io/ioutil"
1010
"os"
11+
"os/exec"
12+
"path/filepath"
1113
"syscall"
1214
"time"
1315

@@ -17,9 +19,11 @@ import (
1719
"github.com/Microsoft/hcsshim/internal/guest/runtime/runc"
1820
"github.com/Microsoft/hcsshim/internal/guest/transport"
1921
"github.com/Microsoft/hcsshim/internal/oc"
22+
"github.com/cenkalti/backoff/v4"
2023
"github.com/containerd/cgroups"
2124
cgroupstats "github.com/containerd/cgroups/stats/v1"
2225
oci "github.com/opencontainers/runtime-spec/specs-go"
26+
"github.com/pkg/errors"
2327
"github.com/sirupsen/logrus"
2428
"go.opencensus.io/trace"
2529
)
@@ -81,6 +85,84 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
8185
}
8286
}
8387

88+
// runWithRestartMonitor starts a command with given args and waits for it to exit. If the
89+
// command exit code is non-zero the command is restarted with with some back off delay.
90+
// Any stdout or stderr of the command will be split into lines and written as a log with
91+
// logrus standard logger. This function must be called in a separate goroutine.
92+
func runWithRestartMonitor(arg0 string, args ...string) {
93+
backoffSettings := backoff.NewExponentialBackOff()
94+
// After we hit 10 min retry interval keep retrying after every 10 mins instead of
95+
// continuing to increase retry interval.
96+
backoffSettings.MaxInterval = time.Minute * 10
97+
for {
98+
command := exec.Command(arg0, args...)
99+
if err := command.Run(); err != nil {
100+
logrus.WithFields(logrus.Fields{
101+
"error": err,
102+
"command": command.Args,
103+
}).Warn("restart monitor: run command returns error")
104+
}
105+
backOffTime := backoffSettings.NextBackOff()
106+
// since backoffSettings.MaxElapsedTime is set to 0 we will never receive backoff.Stop.
107+
time.Sleep(backOffTime)
108+
}
109+
110+
}
111+
112+
// startTimeSyncService starts the `chronyd` deamon to keep the UVM time synchronized. We
113+
// use a PTP device provided by the hypervisor as a source of correct time (instead of
114+
// using a network server). We need to create a configuration file that configures chronyd
115+
// to use the PTP device. The system can have multiple PTP devices so we identify the
116+
// correct PTP device by verifying that the `clock_name` of that device is `hyperv`.
117+
func startTimeSyncService() error {
118+
ptpClassDir, err := os.Open("/sys/class/ptp")
119+
if err != nil {
120+
return errors.Wrap(err, "failed to open PTP class directory")
121+
}
122+
123+
ptpDirList, err := ptpClassDir.Readdirnames(-1)
124+
if err != nil {
125+
return errors.Wrap(err, "failed to list PTP class directory")
126+
}
127+
128+
var ptpDirPath string
129+
found := false
130+
// The file ends with a new line
131+
expectedClockName := "hyperv\n"
132+
for _, ptpDirPath = range ptpDirList {
133+
clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name")
134+
buf, err := ioutil.ReadFile(clockNameFilePath)
135+
if err != nil && !os.IsNotExist(err) {
136+
return errors.Wrapf(err, "failed to read clock name file at %s", clockNameFilePath)
137+
}
138+
139+
if string(buf) == expectedClockName {
140+
found = true
141+
break
142+
}
143+
}
144+
145+
if !found {
146+
return errors.Errorf("no PTP device found with name \"%s\"", expectedClockName)
147+
}
148+
149+
// create chronyd config file
150+
ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath))
151+
// chronyd config file take from: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/time-sync
152+
chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0 stratum 2\nmakestep 0.1 -1\n", ptpDevPath)
153+
chronydConfPath := "/tmp/chronyd.conf"
154+
err = ioutil.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644)
155+
if err != nil {
156+
return errors.Wrapf(err, "failed to create chronyd conf file %s", chronydConfPath)
157+
}
158+
159+
// start chronyd. Do NOT start chronyd as daemon because creating a daemon
160+
// involves double forking the restart monitor will attempt to restart chornyd
161+
// after the first fork child exits.
162+
go runWithRestartMonitor("chronyd", "-n", "-f", chronydConfPath)
163+
return nil
164+
}
165+
84166
func main() {
85167
startTime := time.Now()
86168
logLevel := flag.String("loglevel", "debug", "Logging Level: debug, info, warning, error, fatal, panic.")
@@ -92,6 +174,7 @@ func main() {
92174
v4 := flag.Bool("v4", false, "enable the v4 protocol support and v2 schema")
93175
rootMemReserveBytes := flag.Uint64("root-mem-reserve-bytes", 75*1024*1024, "the amount of memory reserved for the orchestration, the rest will be assigned to containers")
94176
gcsMemLimitBytes := flag.Uint64("gcs-mem-limit-bytes", 50*1024*1024, "the maximum amount of memory the gcs can use")
177+
disableTimeSync := flag.Bool("disable-time-sync", false, "If true do not run chronyd time synchronization service inside the UVM")
95178

96179
flag.Usage = func() {
97180
fmt.Fprintf(os.Stderr, "\nUsage of %s:\n", os.Args[0])
@@ -248,6 +331,13 @@ func main() {
248331
oomFile := os.NewFile(oom, "cefd")
249332
defer oomFile.Close()
250333

334+
// time synchronization service
335+
if !(*disableTimeSync) {
336+
if err = startTimeSyncService(); err != nil {
337+
logrus.WithError(err).Fatal("failed to start time synchronization service")
338+
}
339+
}
340+
251341
go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
252342
go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
253343
err = b.ListenAndServe(bridgeIn, bridgeOut)
@@ -256,4 +346,5 @@ func main() {
256346
logrus.ErrorKey: err,
257347
}).Fatal("failed to serve gcs service")
258348
}
349+
259350
}

internal/oci/uvm.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (
358358
lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy)
359359
lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions)
360360
lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, lopts.ProcessDumpLocation)
361+
lopts.DisableTimeSyncService = parseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService)
361362
handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts)
362363
handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts)
363364

internal/uvm/create_lcow.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ type OptionsLCOW struct {
103103
SecurityPolicyEnabled bool // Set when there is a security policy to apply on actual SNP hardware, use this rather than checking the string length
104104
UseGuestStateFile bool // Use a vmgs file that contains a kernel and initrd, required for SNP
105105
GuestStateFile string // The vmgs file to load
106+
DisableTimeSyncService bool // Disables the time synchronization service
106107
}
107108

108109
// defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW
@@ -152,6 +153,7 @@ func NewDefaultOptionsLCOW(id, owner string) *OptionsLCOW {
152153
SecurityPolicyEnabled: false,
153154
SecurityPolicy: "",
154155
GuestStateFile: "",
156+
DisableTimeSyncService: false,
155157
}
156158

157159
if _, err := os.Stat(filepath.Join(opts.BootFilesPath, VhdFile)); err == nil {
@@ -651,6 +653,10 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs
651653
initArgs += fmt.Sprintf(" -e %d", linuxLogVsockPort)
652654
}
653655

656+
if opts.DisableTimeSyncService {
657+
opts.ExecCommandLine = fmt.Sprintf("%s --disable-time-sync", opts.ExecCommandLine)
658+
}
659+
654660
initArgs += " " + opts.ExecCommandLine
655661

656662
if opts.ProcessDumpLocation != "" {

pkg/annotations/annotations.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,4 +257,8 @@ const (
257257

258258
// GuestStateFile specifies the path of the vmgs file to use if required. Only applies in SNP mode.
259259
GuestStateFile = "io.microsoft.virtualmachine.lcow.gueststatefile"
260+
261+
// DisableLCOWTimeSyncService is used to disable the chronyd time
262+
// synchronization service inside the LCOW UVM.
263+
DisableLCOWTimeSyncService = "io.microsoft.virtualmachine.lcow.timesync.disable"
260264
)

test/cri-containerd/runpodsandbox_test.go

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@ package cri_containerd
44

55
import (
66
"bufio"
7+
"bytes"
78
"context"
89
"fmt"
9-
"github.com/Microsoft/hcsshim/pkg/annotations"
10+
"io"
1011
"io/ioutil"
1112
"os"
1213
"path/filepath"
@@ -19,8 +20,11 @@ import (
1920
"github.com/Microsoft/hcsshim/internal/hcs"
2021
"github.com/Microsoft/hcsshim/internal/lcow"
2122
"github.com/Microsoft/hcsshim/internal/processorinfo"
23+
"github.com/Microsoft/hcsshim/internal/shimdiag"
2224
"github.com/Microsoft/hcsshim/osversion"
25+
"github.com/Microsoft/hcsshim/pkg/annotations"
2326
testutilities "github.com/Microsoft/hcsshim/test/functional/utilities"
27+
"github.com/containerd/containerd/log"
2428
runtime "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
2529
)
2630

@@ -1050,6 +1054,14 @@ func Test_RunPodSandbox_CPUGroup(t *testing.T) {
10501054
}
10511055

10521056
func createExt4VHD(ctx context.Context, t *testing.T, path string) {
1057+
// UVM related functions called below produce a lot of debug logs. Set the logger
1058+
// output to Discard if verbose flag is not set. This way we can still capture
1059+
// these logs in a wpr session.
1060+
if !testing.Verbose() {
1061+
origLogOut := log.L.Logger.Out
1062+
log.L.Logger.SetOutput(io.Discard)
1063+
defer log.L.Logger.SetOutput(origLogOut)
1064+
}
10531065
uvm := testutilities.CreateLCOWUVM(ctx, t, t.Name()+"-createExt4VHD")
10541066
defer uvm.Close()
10551067

@@ -1708,3 +1720,94 @@ func Test_RunPodSandbox_KernelOptions_LCOW(t *testing.T) {
17081720
t.Fatalf("Expected number of hugepages to be 10. Got output instead: %d", numOfHugePages)
17091721
}
17101722
}
1723+
1724+
func Test_RunPodSandbox_TimeSyncService(t *testing.T) {
1725+
requireFeatures(t, featureLCOW)
1726+
1727+
client := newTestRuntimeClient(t)
1728+
ctx, cancel := context.WithCancel(context.Background())
1729+
defer cancel()
1730+
1731+
pullRequiredLCOWImages(t, []string{imageLcowK8sPause})
1732+
1733+
request := getRunPodSandboxRequest(
1734+
t,
1735+
lcowRuntimeHandler)
1736+
1737+
podID := runPodSandbox(t, client, ctx, request)
1738+
defer removePodSandbox(t, client, ctx, podID)
1739+
defer stopPodSandbox(t, client, ctx, podID)
1740+
1741+
shimName := fmt.Sprintf("k8s.io-%s", podID)
1742+
1743+
shim, err := shimdiag.GetShim(shimName)
1744+
if err != nil {
1745+
t.Fatalf("failed to find shim %s: %s", shimName, err)
1746+
}
1747+
1748+
psCmd := []string{"ps"}
1749+
shimClient := shimdiag.NewShimDiagClient(shim)
1750+
outBuf := bytes.Buffer{}
1751+
outw := bufio.NewWriter(&outBuf)
1752+
errBuf := bytes.Buffer{}
1753+
errw := bufio.NewWriter(&errBuf)
1754+
exitCode, err := execInHost(ctx, shimClient, psCmd, nil, outw, errw)
1755+
if err != nil {
1756+
t.Fatalf("failed to exec `%s` in the uvm with %s", psCmd[0], err)
1757+
}
1758+
if exitCode != 0 {
1759+
t.Fatalf("exec `%s` in the uvm failed with exit code: %d, std error: %s", psCmd[0], exitCode, errBuf.String())
1760+
}
1761+
if !strings.Contains(outBuf.String(), "chronyd") {
1762+
t.Logf("standard output of exec %s is: %s\n", psCmd[0], outBuf.String())
1763+
t.Fatalf("chronyd is not running inside the uvm")
1764+
}
1765+
}
1766+
1767+
func Test_RunPodSandbox_DisableTimeSyncService(t *testing.T) {
1768+
requireFeatures(t, featureLCOW)
1769+
1770+
client := newTestRuntimeClient(t)
1771+
ctx, cancel := context.WithCancel(context.Background())
1772+
defer cancel()
1773+
1774+
pullRequiredLCOWImages(t, []string{imageLcowK8sPause})
1775+
1776+
request := getRunPodSandboxRequest(
1777+
t,
1778+
lcowRuntimeHandler,
1779+
WithSandboxAnnotations(
1780+
map[string]string{
1781+
annotations.DisableLCOWTimeSyncService: "true",
1782+
}),
1783+
)
1784+
1785+
podID := runPodSandbox(t, client, ctx, request)
1786+
defer removePodSandbox(t, client, ctx, podID)
1787+
defer stopPodSandbox(t, client, ctx, podID)
1788+
1789+
shimName := fmt.Sprintf("k8s.io-%s", podID)
1790+
1791+
shim, err := shimdiag.GetShim(shimName)
1792+
if err != nil {
1793+
t.Fatalf("failed to find shim %s: %s", shimName, err)
1794+
}
1795+
1796+
psCmd := []string{"ps"}
1797+
shimClient := shimdiag.NewShimDiagClient(shim)
1798+
outBuf := bytes.Buffer{}
1799+
outw := bufio.NewWriter(&outBuf)
1800+
errBuf := bytes.Buffer{}
1801+
errw := bufio.NewWriter(&errBuf)
1802+
exitCode, err := execInHost(ctx, shimClient, psCmd, nil, outw, errw)
1803+
if err != nil {
1804+
t.Fatalf("failed to exec `%s` in the uvm with %s", psCmd[0], err)
1805+
}
1806+
if exitCode != 0 {
1807+
t.Fatalf("exec `%s` in the uvm failed with exit code: %d, std error: %s", psCmd[0], exitCode, errBuf.String())
1808+
}
1809+
if strings.Contains(outBuf.String(), "chronyd") {
1810+
t.Logf("standard output of exec %s is: %s\n", psCmd[0], outBuf.String())
1811+
t.Fatalf("chronyd should not be running inside the uvm")
1812+
}
1813+
}

test/vendor/github.com/Microsoft/hcsshim/internal/oci/uvm.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/vendor/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/vendor/github.com/Microsoft/hcsshim/pkg/annotations/annotations.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)