Skip to content

Commit 063ce46

Browse files
committed
wcow-process: Query Stats directly from shim (#1362)
This change adds in functionality to query statistics directly in the shim instead of reaching out to HCS. One of the main motivators behind this was poor performance for tallying up the private working set total for the container in HCS. HCS calls NtQuerySystemInformation with the class SystemProcessInformation which returns an array containing system information for every process running on the machine. They then grab the pids that are running in the container and filter down the entries in the array to only what's running in that silo and start tallying up the total. This doesn't work well as performance should get worse if more processes are running on the machine in general and not just in the container. All of the additional information besides the WorkingSetPrivateSize field is ignored as well which isn't great and is wasted work to fetch. HCS only let's you grab statistics in an all or nothing fashion, so we can't just grab the private working set ourselves and ask for everything else separately. We can open the silo ourselves and do the same queries for the rest of the info, as well as calculating the private working set in a more efficient manner by: 1. Find the pids running in the silo 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access) 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2. This change additionally: - Changes the jobcontainers package to use this new way to calculate the private working set. - Change the query the StorageStats method in the jobobject package uses to grab IO counters to match what HCS queries. Signed-off-by: Daniel Canter <[email protected]> (cherry picked from commit 4a1216a) Signed-off-by: Daniel Canter <[email protected]>
1 parent 5a70918 commit 063ce46

16 files changed

Lines changed: 1733 additions & 38 deletions

File tree

internal/hcs/system.go

Lines changed: 179 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,22 @@ import (
44
"context"
55
"encoding/json"
66
"errors"
7+
"fmt"
78
"strings"
89
"sync"
910
"syscall"
11+
"time"
1012

1113
"github.com/Microsoft/hcsshim/internal/cow"
1214
"github.com/Microsoft/hcsshim/internal/hcs/schema1"
1315
hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
16+
"github.com/Microsoft/hcsshim/internal/jobobject"
1417
"github.com/Microsoft/hcsshim/internal/log"
18+
"github.com/Microsoft/hcsshim/internal/logfields"
1519
"github.com/Microsoft/hcsshim/internal/oc"
1620
"github.com/Microsoft/hcsshim/internal/timeout"
1721
"github.com/Microsoft/hcsshim/internal/vmcompute"
22+
"github.com/sirupsen/logrus"
1823
"go.opencensus.io/trace"
1924
)
2025

@@ -28,7 +33,8 @@ type System struct {
2833
waitBlock chan struct{}
2934
waitError error
3035
exitError error
31-
os, typ string
36+
os, typ, owner string
37+
startTime time.Time
3238
}
3339

3440
func newSystem(id string) *System {
@@ -38,6 +44,11 @@ func newSystem(id string) *System {
3844
}
3945
}
4046

47+
// Implementation detail for silo naming; this should NOT be relied upon very heavily.
48+
func siloNameFmt(containerID string) string {
49+
return fmt.Sprintf(`\Container_%s`, containerID)
50+
}
51+
4152
// CreateComputeSystem creates a new compute system with the given configuration but does not start it.
4253
func CreateComputeSystem(ctx context.Context, id string, hcsDocumentInterface interface{}) (_ *System, err error) {
4354
operation := "hcs::CreateComputeSystem"
@@ -127,6 +138,7 @@ func (computeSystem *System) getCachedProperties(ctx context.Context) error {
127138
}
128139
computeSystem.typ = strings.ToLower(props.SystemType)
129140
computeSystem.os = strings.ToLower(props.RuntimeOSType)
141+
computeSystem.owner = strings.ToLower(props.Owner)
130142
if computeSystem.os == "" && computeSystem.typ == "container" {
131143
// Pre-RS5 HCS did not return the OS, but it only supported containers
132144
// that ran Windows.
@@ -195,7 +207,7 @@ func (computeSystem *System) Start(ctx context.Context) (err error) {
195207
if err != nil {
196208
return makeSystemError(computeSystem, operation, err, events)
197209
}
198-
210+
computeSystem.startTime = time.Now()
199211
return nil
200212
}
201213

@@ -324,11 +336,115 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
324336
return properties, nil
325337
}
326338

327-
// PropertiesV2 returns the requested container properties targeting a V2 schema container.
328-
func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) {
329-
computeSystem.handleLock.RLock()
330-
defer computeSystem.handleLock.RUnlock()
339+
// queryInProc handles querying for container properties without reaching out to HCS. `props`
340+
// will be updated to contain any data returned from the queries present in `types`. If any properties
341+
// failed to be queried, they will be tallied up and returned as the first return value. Failures on
342+
// query are NOT considered errors; the only failure case for this method is if the container's job object
343+
// cannot be opened.
344+
func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.Properties, types []hcsschema.PropertyType) ([]hcsschema.PropertyType, error) {
345+
// In the future we can make use of some new functionality in the HCS that allows you
346+
// to pass a job object for HCS to use for the container. Currently, the only way we'll
347+
// be able to open the job/silo is if we're running as SYSTEM.
348+
jobOptions := &jobobject.Options{
349+
UseNTVariant: true,
350+
Name: siloNameFmt(computeSystem.id),
351+
}
352+
job, err := jobobject.Open(ctx, jobOptions)
353+
if err != nil {
354+
return nil, err
355+
}
356+
defer job.Close()
357+
358+
var fallbackQueryTypes []hcsschema.PropertyType
359+
for _, propType := range types {
360+
switch propType {
361+
case hcsschema.PTStatistics:
362+
// Handle a bad caller asking for the same type twice. No use in re-querying if this is
363+
// filled in already.
364+
if props.Statistics == nil {
365+
props.Statistics, err = computeSystem.statisticsInProc(job)
366+
if err != nil {
367+
log.G(ctx).WithError(err).Warn("failed to get statistics in-proc")
368+
369+
fallbackQueryTypes = append(fallbackQueryTypes, propType)
370+
}
371+
}
372+
default:
373+
fallbackQueryTypes = append(fallbackQueryTypes, propType)
374+
}
375+
}
376+
377+
return fallbackQueryTypes, nil
378+
}
379+
380+
// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
381+
// change to make grabbing the private working set total much more efficient.
382+
func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcsschema.Statistics, error) {
383+
// Start timestamp for these stats before we grab them to match HCS
384+
timestamp := time.Now()
385+
386+
memInfo, err := job.QueryMemoryStats()
387+
if err != nil {
388+
return nil, err
389+
}
390+
391+
processorInfo, err := job.QueryProcessorStats()
392+
if err != nil {
393+
return nil, err
394+
}
395+
396+
storageInfo, err := job.QueryStorageStats()
397+
if err != nil {
398+
return nil, err
399+
}
400+
401+
// This calculates the private working set more efficiently than HCS does. HCS calls NtQuerySystemInformation
402+
// with the class SystemProcessInformation which returns an array containing system information for *every*
403+
// process running on the machine. They then grab the pids that are running in the container and filter down
404+
// the entries in the array to only what's running in that silo and start tallying up the total. This doesn't
405+
// work well as performance should get worse if more processes are running on the machine in general and not
406+
// just in the container. All of the additional information besides the WorkingSetPrivateSize field is ignored
407+
// as well which isn't great and is wasted work to fetch.
408+
//
409+
// HCS only lets you grab statistics in an all-or-nothing fashion, so we can't just grab the private
410+
// working set ourselves and ask for everything else separately. The optimization we can make here is
411+
// to open the silo ourselves and do the same queries for the rest of the info, as well as calculating
412+
// the private working set in a more efficient manner by:
413+
//
414+
// 1. Find the pids running in the silo
415+
// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
416+
// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
417+
// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
418+
privateWorkingSet, err := job.QueryPrivateWorkingSet()
419+
if err != nil {
420+
return nil, err
421+
}
331422

423+
return &hcsschema.Statistics{
424+
Timestamp: timestamp,
425+
ContainerStartTime: computeSystem.startTime,
426+
Uptime100ns: uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
427+
Memory: &hcsschema.MemoryStats{
428+
MemoryUsageCommitBytes: memInfo.JobMemory,
429+
MemoryUsageCommitPeakBytes: memInfo.PeakJobMemoryUsed,
430+
MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
431+
},
432+
Processor: &hcsschema.ProcessorStats{
433+
RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
434+
RuntimeUser100ns: uint64(processorInfo.TotalUserTime),
435+
TotalRuntime100ns: uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
436+
},
437+
Storage: &hcsschema.StorageStats{
438+
ReadCountNormalized: uint64(storageInfo.ReadStats.IoCount),
439+
ReadSizeBytes: storageInfo.ReadStats.TotalSize,
440+
WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
441+
WriteSizeBytes: storageInfo.WriteStats.TotalSize,
442+
},
443+
}, nil
444+
}
445+
446+
// hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types.
447+
func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) {
332448
operation := "hcs::System::PropertiesV2"
333449

334450
queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types})
@@ -345,12 +461,66 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
345461
if propertiesJSON == "" {
346462
return nil, ErrUnexpectedValue
347463
}
348-
properties := &hcsschema.Properties{}
349-
if err := json.Unmarshal([]byte(propertiesJSON), properties); err != nil {
464+
props := &hcsschema.Properties{}
465+
if err := json.Unmarshal([]byte(propertiesJSON), props); err != nil {
350466
return nil, makeSystemError(computeSystem, operation, err, nil)
351467
}
352468

353-
return properties, nil
469+
return props, nil
470+
}
471+
472+
// PropertiesV2 returns the requested compute system's properties targeting a V2 schema compute system.
473+
func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
474+
computeSystem.handleLock.RLock()
475+
defer computeSystem.handleLock.RUnlock()
476+
477+
// Let HCS tally up the total for VM based queries instead of querying ourselves.
478+
if computeSystem.typ != "container" {
479+
return computeSystem.hcsPropertiesV2Query(ctx, types)
480+
}
481+
482+
// Define a starter Properties struct with the default fields returned from every
483+
// query. Owner is only returned from Statistics but it's harmless to include.
484+
properties := &hcsschema.Properties{
485+
Id: computeSystem.id,
486+
SystemType: computeSystem.typ,
487+
RuntimeOsType: computeSystem.os,
488+
Owner: computeSystem.owner,
489+
}
490+
491+
logEntry := log.G(ctx)
492+
// First let's try and query ourselves without reaching out to HCS. If any of the queries fail
493+
// we'll take note and fall back to querying HCS for any of the failed types.
494+
fallbackTypes, err := computeSystem.queryInProc(ctx, properties, types)
495+
if err == nil && len(fallbackTypes) == 0 {
496+
return properties, nil
497+
} else if err != nil {
498+
logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %w", err))
499+
fallbackTypes = types
500+
}
501+
502+
logEntry.WithFields(logrus.Fields{
503+
logfields.ContainerID: computeSystem.id,
504+
"propertyTypes": fallbackTypes,
505+
}).Info("falling back to HCS for property type queries")
506+
507+
hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes)
508+
if err != nil {
509+
return nil, err
510+
}
511+
512+
// Now add in anything that we might have successfully queried in process.
513+
if properties.Statistics != nil {
514+
hcsProperties.Statistics = properties.Statistics
515+
hcsProperties.Owner = properties.Owner
516+
}
517+
518+
// For future support for querying processlist in-proc as well.
519+
if properties.ProcessList != nil {
520+
hcsProperties.ProcessList = properties.ProcessList
521+
}
522+
523+
return hcsProperties, nil
354524
}
355525

356526
// Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.

internal/jobcontainers/jobcontainer.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,9 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop
381381
return nil, errors.New("PTStatistics is the only supported property type for job containers")
382382
}
383383

384+
// Start timestamp before we grab the stats to match HCS' behavior
385+
timestamp := time.Now()
386+
384387
memInfo, err := c.job.QueryMemoryStats()
385388
if err != nil {
386389
return nil, errors.Wrap(err, "failed to query for job containers memory information")
@@ -396,18 +399,15 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop
396399
return nil, errors.Wrap(err, "failed to query for job containers storage information")
397400
}
398401

399-
var privateWorkingSet uint64
400-
err = forEachProcessInfo(c.job, func(procInfo *winapi.SYSTEM_PROCESS_INFORMATION) {
401-
privateWorkingSet += uint64(procInfo.WorkingSetPrivateSize)
402-
})
402+
privateWorkingSet, err := c.job.QueryPrivateWorkingSet()
403403
if err != nil {
404-
return nil, errors.Wrap(err, "failed to get private working set for container")
404+
return nil, fmt.Errorf("failed to get private working set for container: %w", err)
405405
}
406406

407407
return &hcsschema.Properties{
408408
Statistics: &hcsschema.Statistics{
409-
Timestamp: time.Now(),
410-
Uptime100ns: uint64(time.Since(c.startTimestamp)) / 100,
409+
Timestamp: timestamp,
410+
Uptime100ns: uint64(time.Since(c.startTimestamp).Nanoseconds()) / 100,
411411
ContainerStartTime: c.startTimestamp,
412412
Memory: &hcsschema.MemoryStats{
413413
MemoryUsageCommitBytes: memInfo.JobMemory,
@@ -420,10 +420,10 @@ func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.Prop
420420
TotalRuntime100ns: uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
421421
},
422422
Storage: &hcsschema.StorageStats{
423-
ReadCountNormalized: storageInfo.IoInfo.ReadOperationCount,
424-
ReadSizeBytes: storageInfo.IoInfo.ReadTransferCount,
425-
WriteCountNormalized: storageInfo.IoInfo.WriteOperationCount,
426-
WriteSizeBytes: storageInfo.IoInfo.WriteTransferCount,
423+
ReadCountNormalized: uint64(storageInfo.ReadStats.IoCount),
424+
ReadSizeBytes: storageInfo.ReadStats.TotalSize,
425+
WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
426+
WriteSizeBytes: storageInfo.WriteStats.TotalSize,
427427
},
428428
},
429429
}, nil

internal/jobobject/jobobject.go

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -416,18 +416,20 @@ func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_
416416
}
417417

418418
// QueryStorageStats gets the storage (I/O) stats for the job object.
419-
func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_BASIC_AND_IO_ACCOUNTING_INFORMATION, error) {
419+
func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) {
420420
job.handleLock.RLock()
421421
defer job.handleLock.RUnlock()
422422

423423
if job.handle == 0 {
424424
return nil, ErrAlreadyClosed
425425
}
426426

427-
info := winapi.JOBOBJECT_BASIC_AND_IO_ACCOUNTING_INFORMATION{}
427+
info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{
428+
ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE,
429+
}
428430
if err := winapi.QueryInformationJobObject(
429431
job.handle,
430-
winapi.JobObjectBasicAndIoAccountingInformation,
432+
winapi.JobObjectIoAttribution,
431433
uintptr(unsafe.Pointer(&info)),
432434
uint32(unsafe.Sizeof(info)),
433435
nil,
@@ -436,3 +438,62 @@ func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_BASIC_AND_IO_ACCOUN
436438
}
437439
return &info, nil
438440
}
441+
442+
// QueryPrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
443+
// private working set for every process running in the job.
444+
func (job *JobObject) QueryPrivateWorkingSet() (uint64, error) {
445+
pids, err := job.Pids()
446+
if err != nil {
447+
return 0, err
448+
}
449+
450+
openAndQueryWorkingSet := func(pid uint32) (uint64, error) {
451+
h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
452+
if err != nil {
453+
// Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a
454+
// case where one of the pids in the job exited before we open.
455+
return 0, nil
456+
}
457+
defer func() {
458+
_ = windows.Close(h)
459+
}()
460+
// Check if the process is actually running in the job still. There's a small chance
461+
// that the process could have exited and had its pid re-used between grabbing the pids
462+
// in the job and opening the handle to it above.
463+
var inJob int32
464+
if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil {
465+
// This shouldn't fail unless we have incorrect access rights which we control
466+
// here so probably best to error out if this failed.
467+
return 0, err
468+
}
469+
// Don't report stats for this process as it's not running in the job. This shouldn't be
470+
// an error condition though.
471+
if inJob == 0 {
472+
return 0, nil
473+
}
474+
475+
var vmCounters winapi.VM_COUNTERS_EX2
476+
status := winapi.NtQueryInformationProcess(
477+
h,
478+
winapi.ProcessVmCounters,
479+
uintptr(unsafe.Pointer(&vmCounters)),
480+
uint32(unsafe.Sizeof(vmCounters)),
481+
nil,
482+
)
483+
if !winapi.NTSuccess(status) {
484+
return 0, fmt.Errorf("failed to query information for process: %w", winapi.RtlNtStatusToDosError(status))
485+
}
486+
return uint64(vmCounters.PrivateWorkingSetSize), nil
487+
}
488+
489+
var jobWorkingSetSize uint64
490+
for _, pid := range pids {
491+
workingSet, err := openAndQueryWorkingSet(pid)
492+
if err != nil {
493+
return 0, err
494+
}
495+
jobWorkingSetSize += workingSet
496+
}
497+
498+
return jobWorkingSetSize, nil
499+
}

0 commit comments

Comments
 (0)